From b8bd8757e483541f38ca29837ea033194014356e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sat, 11 Jun 2022 09:37:24 +0200 Subject: [PATCH 01/68] MAINT Implement CSR support for all DistanceMetric --- sklearn/metrics/_dist_metrics.pxd.tp | 61 +- sklearn/metrics/_dist_metrics.pyx.tp | 1277 +++++++++++++++++++- sklearn/metrics/tests/test_dist_metrics.py | 70 +- 3 files changed, 1331 insertions(+), 77 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 32ba546672c6e..2ba4545dc02fc 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -79,8 +79,8 @@ cdef class DistanceMetric{{name_suffix}}: # Because we don't expect to instantiate a lot of these objects, the # extra memory overhead of this setup should not be an issue. cdef {{DTYPE_t}} p - cdef {{DTYPE_t}}[::1] vec - cdef {{DTYPE_t}}[:, ::1] mat + cdef DTYPE_t[::1] vec + cdef DTYPE_t[:, ::1] mat cdef ITYPE_t size cdef object func cdef object kwargs @@ -91,10 +91,59 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1 - cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1 - - cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y, - {{DTYPE_t}}[:, ::1] D) except -1 + cdef DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1 + + cdef DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1 + + cdef int pdist(self, + const {{DTYPE_t}}[:, ::1] X, + DTYPE_t[:, ::1] D, + ) except -1 + + cdef int cdist(self, + const {{DTYPE_t}}[:, ::1] X, + const {{DTYPE_t}}[:, ::1] Y, + DTYPE_t[:, ::1] D, + ) except -1 + + cdef int csr_pdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1 + + cdef int csr_cdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t[:] x2_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1 cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1 diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 5986fa939b45d..51108a18500ca 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -250,8 +250,8 @@ cdef class DistanceMetric{{name_suffix}}: """ def __cinit__(self): self.p = 2 - self.vec = np.zeros(1, dtype={{DTYPE}}, order='C') - self.mat = np.zeros((1, 1), dtype={{DTYPE}}, order='C') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -356,8 +356,11 @@ cdef class DistanceMetric{{name_suffix}}: """ return self.dist(x1, x2, size) - cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1: - 
"""compute the pairwise distances between points in X""" + cdef int pdist(self, + const {{DTYPE_t}}[:, ::1] X, + DTYPE_t[:, ::1] D, + ) except -1: + """Compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 for i1 in range(X.shape[0]): for i2 in range(i1, X.shape[0]): @@ -365,9 +368,13 @@ cdef class DistanceMetric{{name_suffix}}: D[i2, i1] = D[i1, i2] return 0 - cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y, - {{DTYPE_t}}[:, ::1] D) except -1: - """compute the cross-pairwise distances between arrays X and Y""" + + cdef int cdist(self, + const {{DTYPE_t}}[:, ::1] X, + const {{DTYPE_t}}[:, ::1] Y, + DTYPE_t[:, ::1] D, + ) except -1: + """Compute the cross-pairwise distances between arrays X and Y""" cdef ITYPE_t i1, i2 if X.shape[1] != Y.shape[1]: raise ValueError('X and Y must have the same second dimension') @@ -376,6 +383,136 @@ cdef class DistanceMetric{{name_suffix}}: D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 + cdef DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + """Compute the distance between vectors x1 and x2 represented + under the CSR format. + + This must be overridden in a base class. + + Note that we pass all the parameter as to not use memoryview slicing + because it is currently known to slow down execution as it + takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 + """ + return -999 + + cdef DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + """Compute the distance between vectors x1 and x2 represented + under the CSR format. + + This can optionally be overridden in a base class. + + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Note that we pass all the parameter as to not use memoryview slicing + because it is currently known to slow down execution as it + takes the GIL. 
See: https://github.com/scikit-learn/scikit-learn/issues/17299 + """ + return self.csr_dist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + + cdef int csr_pdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1: + """Compute the pairwise distances between points in X + represented in the CSR format.""" + cdef: + ITYPE_t i1, i2 + ITYPE_t n_x1 = x1_indptr.shape[0] - 1 + ITYPE_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(i1, n_x1): + x2_start = x1_indptr[i2] + x2_end = x1_indptr[i2 + 1] + D[i1, i2] = D[i2, i1] = self.csr_dist( + x1_data, + x1_indices, + x1_data, + x1_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + + cdef int csr_cdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t[:] x2_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1: + """Compute the cross-pairwise distances between arrays X and Y + represented in the CSR format.""" + cdef: + ITYPE_t i1, i2 + ITYPE_t n_x1 = x1_indptr.shape[0] - 1 + ITYPE_t n_x2 = x2_indptr.shape[0] - 1 + ITYPE_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(n_x2): + x2_start = x2_indptr[i2] + x2_end = x2_indptr[i2 + 1] + + D[i1, i2] = self.csr_dist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: """Convert the rank-preserving surrogate distance to the distance""" return rdist @@ -424,6 +561,61 @@ cdef class DistanceMetric{{name_suffix}}: """ return dist + def _pairwise_dense(self, X, Y=None): + cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr + cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr + cdef cnp.ndarray[DTYPE_t, ndim=2, mode='c'] Darr + + Xarr = np.asarray(X, dtype={{DTYPE}}, order='C') + self._validate_data(Xarr) + if Y is None: + Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), + dtype=DTYPE, order='C') + self.pdist(Xarr, Darr) + else: + Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C') + self._validate_data(Yarr) + Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), + dtype=DTYPE, order='C') + self.cdist(Xarr, Yarr, Darr) + return Darr + + def _pairwise_sparse(self, X, Y=None): + X_csr = X.tocsr() + n_X, size = X_csr.shape + X_data = np.asarray(X_csr.data, dtype={{DTYPE}}) + X_indices = np.asarray(X_csr.indices, dtype=np.int32) + X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) + + if Y is None: + Darr = np.zeros((n_X, n_X), dtype=DTYPE, order='C') + self.csr_pdist( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + size=size, + D=Darr, + ) + else: + Y_csr = Y.tocsr() + n_Y, _ = Y_csr.shape + Y_data = np.asarray(Y_csr.data, dtype={{DTYPE}}) + Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) + Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) + + Darr = np.zeros((n_X, n_Y), dtype=DTYPE, order='C') + self.csr_cdist( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=Y_data, + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=size, + D=Darr, + ) + return Darr + def pairwise(self, X, Y=None): """Compute the 
pairwise distances between X and Y @@ -445,23 +637,10 @@ cdef class DistanceMetric{{name_suffix}}: The shape (Nx, Ny) array of pairwise distances between points in X and Y. """ - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Darr - - Xarr = np.asarray(X, dtype={{DTYPE}}, order='C') - self._validate_data(Xarr) - if Y is None: - Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), - dtype={{DTYPE}}, order='C') - self.pdist(Xarr, Darr) - else: - Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C') - self._validate_data(Yarr) - Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), - dtype={{DTYPE}}, order='C') - self.cdist(Xarr, Yarr, Darr) - return Darr + if not issparse(X) and (Y is None or not issparse(Y)): + return self._pairwise_dense(X, Y) + if issparse(X) and (Y is None or issparse(Y)): + return self._pairwise_sparse(X, Y) #------------------------------------------------------------ @@ -496,6 +675,79 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + DTYPE_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = (x1_data[i1] - x2_data[i2]) + d = d + (unsquared * unsquared) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + else: + while i1 < x1_end: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return sqrt(self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) #------------------------------------------------------------ # SEuclidean Distance @@ -507,7 +759,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } """ def __init__(self, V): - self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype={{DTYPE}})) + self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=DTYPE)) self.size = self.vec.shape[0] self.p = 2 @@ -540,6 +792,80 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t 
x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + DTYPE_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = (x1_data[i1] - x2_data[i2]) + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix1] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix2] + i1 = i1 + 1 + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return sqrt(self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) #------------------------------------------------------------ # Manhattan Distance @@ -561,6 +887,51 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) return d + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + fabs(x1_data[i1] - x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + else: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + + return d + #------------------------------------------------------------ # Chebyshev Distance @@ -595,6 +966,52 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + else: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + else: + while i1 < x1_end: + 
d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + + return d + + #------------------------------------------------------------ # Minkowski Distance cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): @@ -632,14 +1049,14 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): self.p = p if w is not None: w_array = check_array( - w, ensure_2d=False, dtype={{DTYPE}}, input_name="w" + w, ensure_2d=False, dtype=DTYPE, input_name="w" ) if (w_array < 0).any(): raise ValueError("w cannot contain negative weights") self.vec = ReadonlyArrayWrapper(w_array) self.size = self.vec.shape[0] else: - self.vec = ReadonlyArrayWrapper(np.asarray([], dtype={{DTYPE}})) + self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE)) self.size = 0 def _validate_data(self, X): @@ -677,6 +1094,106 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + bint has_w = self.size > 0 + + if has_w: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + else: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return pow( + self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ), + 1 / self.p + ) #------------------------------------------------------------ # TODO: Remove in 1.3 - WMinkowskiDistance class @@ -714,7 +1231,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("WMinkowskiDistance requires finite p. 
" "For p=inf, use ChebyshevDistance.") self.p = p - self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype={{DTYPE}})) + self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE)) self.size = self.vec.shape[0] def _validate_data(self, X): @@ -747,6 +1264,78 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + pow(self.vec[ix1] * fabs(x1_data[i1] - x2_data[i2]), self.p) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + i1 = i1 + 1 + else: + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + i1 = i1 + 1 + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return pow( + self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ), + 1 / self.p + ) #------------------------------------------------------------ # Mahalanobis Distance @@ -775,12 +1364,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: raise ValueError("V/VI must be square") - self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype={{DTYPE}}, order='C')) + self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=DTYPE, order='C')) self.size = self.mat.shape[0] # we need vec as a work buffer - self.vec = np.zeros(self.size, dtype={{DTYPE}}) + self.vec = np.zeros(self.size, dtype=DTYPE) def _validate_data(self, X): if X.shape[1] != self.size: @@ -818,6 +1407,81 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t tmp, d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + self.vec[ix1] = x1_data[i1] - x2_data[i2] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + self.vec[ix1] = x1_data[i1] + i1 = i1 + 1 + else: + self.vec[ix2] = - x2_data[i2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] 
+ self.vec[ix2] = - x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + self.vec[ix1] = x1_data[i1] + i1 = i1 + 1 + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat[i, j] * self.vec[j] + d += tmp * self.vec[i] + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return sqrt(self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) #------------------------------------------------------------ # Hamming Distance @@ -841,6 +1505,54 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += (x1_data[i1] != x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += 1 + i1 = i1 + 1 + else: + d += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + d += 1 + i1 = i1 + 1 + + d /= size + + return d + + #------------------------------------------------------------ # Canberra Distance # D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] @@ -863,6 +1575,50 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (fabs(x1[j] - x2[j])) / denom return d + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += fabs(x1_data[i1] - x2_data[i2]) / (fabs(x1_data[i1]) + fabs(x2_data[i2])) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += 1. + i1 = i1 + 1 + else: + d += 1. + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += 1. + i2 = i2 + 1 + else: + while i1 < x1_end: + d += 1. 
+ i1 = i1 + 1 + + return d #------------------------------------------------------------ # Bray-Curtis Distance @@ -888,6 +1644,56 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t num = 0.0 + DTYPE_t denom = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + num += fabs(x1_data[i1] - x2_data[i2]) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i1 = i1 + 1 + else: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i2 = i2 + 1 + else: + while i1 < x1_end: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i1 = i1 + 1 + + return num / denom #------------------------------------------------------------ # Jaccard Distance (boolean) @@ -903,20 +1709,73 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_eq = 0, nnz = 0 + cdef int tf1, tf2, n_tt = 0, nnz = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 nnz += (tf1 or tf2) - n_eq += (tf1 and tf2) + n_tt += (tf1 and tf2) # Based on https://github.com/scipy/scipy/pull/7373 # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric # was changed to return 0, instead of nan. if nnz == 0: return 0 - return (nnz - n_eq) * 1.0 / nnz + return (nnz - n_tt) * 1.0 / nnz + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + nnz += (tf1 or tf2) + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + nnz += tf1 + i1 = i1 + 1 + else: + nnz += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + nnz += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + nnz += tf1 + i1 = i1 + 1 + # Based on https://github.com/scipy/scipy/pull/7373 + # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric + # was changed to return 0, instead of nan. + if nnz == 0: + return 0 + return (nnz - n_tt) * 1.0 / nnz #------------------------------------------------------------ # Matching Distance (boolean) @@ -940,6 +1799,52 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. 
/ size + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return n_neq * 1.0 / size #------------------------------------------------------------ # Dice Distance (boolean) @@ -956,14 +1861,63 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0, ntt = 0 + cdef int tf1, tf2, n_neq = 0, n_tt = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 - ntt += (tf1 and tf2) + n_tt += (tf1 and tf2) n_neq += (tf1 != tf2) - return n_neq / (2.0 * ntt + n_neq) + return n_neq / (2.0 * n_tt + n_neq) + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return n_neq / (2.0 * n_tt + n_neq) #------------------------------------------------------------ @@ -981,15 +1935,63 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return (n_neq - ntt + size) * 1.0 / (n_neq + size) + n_tt += (tf1 and tf2) + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = 
x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) #------------------------------------------------------------ # Rogers-Tanimoto Distance (boolean) @@ -1013,6 +2015,53 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) #------------------------------------------------------------ # Russell-Rao Distance (boolean) @@ -1028,13 +2077,55 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0 + cdef int tf1, tf2, n_tt = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 - ntt += (tf1 and tf2) - return (size - ntt) * 1. / size + n_tt += (tf1 and tf2) + return (size - n_tt) * 1. / size + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + i1 = i1 + 1 + else: + i2 = i2 + 1 + + # We don't need to go through all the longuest + # vector because tf1 or tf2 will be false + # and thus n_tt won't be increased. + + return (size - n_tt) * 1. 
/ size + #------------------------------------------------------------ @@ -1059,6 +2150,53 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) #------------------------------------------------------------ # Sokal-Sneath Distance (boolean) @@ -1074,14 +2212,63 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return n_neq / (0.5 * ntt + n_neq) + n_tt += (tf1 and tf2) + return n_neq / (0.5 * n_tt + n_neq) + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return n_neq / (0.5 * n_tt + n_neq) #------------------------------------------------------------ diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 4cc8b945ffdab..8f1ddd662ca3f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -83,14 +83,15 @@ def test_cdist(metric_param_grid, X, Y): ) metric, param_grid = metric_param_grid keys = param_grid.keys() + X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y) for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) + rtol_dict = {} if metric == "mahalanobis": - # See: https://github.com/scipy/scipy/issues/13861 - # Possibly caused by: https://github.com/joblib/joblib/issues/563 - pytest.xfail( - "scipy#13861: cdist with 'mahalanobis' fails on joblib memmap data" - ) + # Computation of mahalanobis 
differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. + rtol_dict = {"rtol": 1e-6} if metric == "wminkowski": # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 @@ -104,7 +105,10 @@ def test_cdist(metric_param_grid, X, Y): dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X, Y) - assert_allclose(D_sklearn, D_scipy_cdist) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr, Y_csr) + assert_allclose(D_sklearn_csr, D_scipy_cdist, **rtol_dict) @pytest.mark.parametrize("metric", BOOL_METRICS) @@ -112,28 +116,38 @@ def test_cdist(metric_param_grid, X, Y): "X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)] ) def test_cdist_bool_metric(metric, X_bool, Y_bool): - D_true = cdist(X_bool, Y_bool, metric) + D_scipy_cdist = cdist(X_bool, Y_bool, metric) + dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(X_bool, Y_bool) - assert_allclose(D12, D_true) + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) + + X_bool_csr, Y_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) + D_sklearn_csr = dm.pairwise(X_bool_csr, Y_csr) + assert_allclose(D_sklearn_csr, D_scipy_cdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) -@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) -def test_pdist(metric_param_grid, X, Y): +@pytest.mark.parametrize("X", [X64, X32, X_mmap]) +def test_pdist(metric_param_grid, X): DistanceMetricInterface = ( - DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32 + DistanceMetric if X.dtype == np.float64 else DistanceMetric32 ) metric, param_grid = metric_param_grid keys = param_grid.keys() + X_csr = sp.csr_matrix(X) for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) + rtol_dict = {} if metric == "mahalanobis": - # See: https://github.com/scipy/scipy/issues/13861 - pytest.xfail("scipy#13861: pdist with 'mahalanobis' fails onmemmap data") - elif metric == "wminkowski": + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. 
+ rtol_dict = {"rtol": 1e-6} + + if metric == "wminkowski": if sp_version >= parse_version("1.8.0"): pytest.skip("wminkowski will be removed in SciPy 1.8.0") @@ -142,13 +156,16 @@ def test_pdist(metric_param_grid, X, Y): if sp_version >= parse_version("1.6.0"): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - D_true = cdist(X, X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) else: - D_true = cdist(X, X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) - D12 = dm.pairwise(X) - assert_allclose(D12, D_true) + D_sklearn = dm.pairwise(X) + assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr) + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @@ -166,25 +183,26 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - assert_allclose(D64, D32) + assert_allclose(D64, D32, rtol=1e-5) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32) + assert_allclose(D64, D32, rtol=1e-5) @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): - D_true = cdist(X_bool, X_bool, metric) - dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(X_bool) + D_scipy_pdist = cdist(X_bool, X_bool, metric) # Based on https://github.com/scipy/scipy/pull/7373 # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric # was changed to return 0, instead of nan. if metric == "jaccard" and sp_version < parse_version("1.2.0"): - D_true[np.isnan(D_true)] = 0 - assert_allclose(D12, D_true) + D_scipy_pdist[np.isnan(D_scipy_pdist)] = 0 + + dm = DistanceMetric.get_metric(metric) + D_sklearn = dm.pairwise(X_bool) + assert_allclose(D_sklearn, D_scipy_pdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed From fb9968017ec55c050b974bf62decc83454f75314 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Jun 2022 09:50:18 +0200 Subject: [PATCH 02/68] TST Remove useless guard --- sklearn/metrics/tests/test_dist_metrics.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 8f1ddd662ca3f..e1273a900d90f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -194,12 +194,6 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): D_scipy_pdist = cdist(X_bool, X_bool, metric) - # Based on https://github.com/scipy/scipy/pull/7373 - # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric - # was changed to return 0, instead of nan. 
- if metric == "jaccard" and sp_version < parse_version("1.2.0"): - D_scipy_pdist[np.isnan(D_scipy_pdist)] = 0 - dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) From d39d2b2486b39a7cefe07a2ea0caeab329614edc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Jun 2022 10:57:52 +0200 Subject: [PATCH 03/68] TST Skip JaccardDistance on 32bit architecture --- sklearn/metrics/_dist_metrics.pyx.tp | 6 +++--- sklearn/metrics/tests/test_dist_metrics.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 51108a18500ca..154c242eb66b2 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -1709,19 +1709,19 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_tt = 0, nnz = 0 + cdef int tf1, tf2, n_eq = 0, nnz = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 nnz += (tf1 or tf2) - n_tt += (tf1 and tf2) + n_eq += (tf1 and tf2) # Based on https://github.com/scipy/scipy/pull/7373 # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric # was changed to return 0, instead of nan. if nnz == 0: return 0 - return (nnz - n_tt) * 1.0 / nnz + return (nnz - n_eq) * 1.0 / nnz cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index e1273a900d90f..d939dd431a01a 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -17,7 +17,7 @@ DistanceMetric32, ) -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, _IS_32BIT from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -118,6 +118,9 @@ def test_cdist(metric_param_grid, X, Y): def test_cdist_bool_metric(metric, X_bool, Y_bool): D_scipy_cdist = cdist(X_bool, Y_bool, metric) + if metric == "jaccard" and _IS_32BIT: + pytest.skip("Jaccard Distance on 32bit architecture is unstable.") + dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool, Y_bool) assert_allclose(D_sklearn, D_scipy_cdist) From 011e2a2ad072908215a827b59058745dc8ce2c1f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 16:49:38 +0200 Subject: [PATCH 04/68] MAINT Define dtype alias for sparse matrices indices --- sklearn/metrics/_dist_metrics.pxd.tp | 38 ++-- sklearn/metrics/_dist_metrics.pyx.tp | 312 +++++++++++++-------------- sklearn/utils/_typedefs.pxd | 10 + sklearn/utils/_typedefs.pyx | 3 + 4 files changed, 188 insertions(+), 175 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 2ba4545dc02fc..ba257e89f02d4 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -28,7 +28,7 @@ implementation_specific_values = [ cimport numpy as cnp from libc.math cimport sqrt, exp -from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t {{for name_suffix, DTYPE_t, DTYPE in implementation_specific_values}} @@ -93,25 +93,25 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] 
x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1 cdef DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1 @@ -128,19 +128,19 @@ cdef class DistanceMetric{{name_suffix}}: cdef int csr_pdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1 cdef int csr_cdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t[:] x2_indptr, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1 diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 154c242eb66b2..d0a1328fc3e1e 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -385,13 +385,13 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: """Compute the distance between vectors x1 and x2 represented @@ -407,13 +407,13 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: """Compute the distance between vectors x1 and x2 represented @@ -444,8 +444,8 @@ cdef class DistanceMetric{{name_suffix}}: cdef int csr_pdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const 
SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1: @@ -477,11 +477,11 @@ cdef class DistanceMetric{{name_suffix}}: cdef int csr_cdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t[:] x2_indptr, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1: @@ -677,13 +677,13 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -728,13 +728,13 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return sqrt(self.csr_rdist( @@ -794,13 +794,13 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -846,13 +846,13 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return sqrt(self.csr_rdist( @@ -889,13 +889,13 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] 
x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -968,13 +968,13 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1096,13 +1096,13 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1171,13 +1171,13 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return pow( @@ -1266,13 +1266,13 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1313,13 +1313,13 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t 
x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return pow( @@ -1409,13 +1409,13 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1462,13 +1462,13 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return sqrt(self.csr_rdist( @@ -1507,13 +1507,13 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1577,13 +1577,13 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1646,13 +1646,13 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1725,13 +1725,13 @@ cdef class 
JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1801,13 +1801,13 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1872,13 +1872,13 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1946,13 +1946,13 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2017,13 +2017,13 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2087,13 +2087,13 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const 
cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2152,13 +2152,13 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2223,13 +2223,13 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index ee0c8ca3b57e9..9c2db8cf32c4a 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -15,3 +15,13 @@ cdef enum: ctypedef cnp.intp_t ITYPE_t # WARNING: should match ITYPE in typedefs.pyx ctypedef cnp.int32_t INT32TYPE_t # WARNING: should match INT32TYPE in typedefs.pyx ctypedef cnp.int64_t INT64TYPE_t # WARNING: should match INT32TYPE in typedefs.pyx + +# scipy matrices indices dtype (namely for indptr and indices arrays) +# +# Note that indices might need to be represented as cnp.int64_t. +# Currently, we use Cython classes which do not handle fused types +# so we hardcode this type to cnp.int32_t, supporting all but edge +# cases. 
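+# (scipy.sparse typically promotes `indices` and `indptr` to int64 only when
+# the number of stored elements or a dimension no longer fits in int32, so
+# hardcoding int32 covers the vast majority of practical inputs)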
+# +# TODO: support cnp.int64_t for this case +ctypedef cnp.int32_t SPARSE_INDEX_TYPE_t diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx index 09e5a6a44944a..839aa4e5fde83 100644 --- a/sklearn/utils/_typedefs.pyx +++ b/sklearn/utils/_typedefs.pyx @@ -19,6 +19,9 @@ INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd #DTYPE = np.asarray(ddummy_view).dtype DTYPE = np.float64 # WARNING: this should match DTYPE_t in typedefs.pxd +# WARNING: this must match SPARSE_INDEX_TYPE_t in typedefs.pxd +SPARSE_INDEX_TYPE = np.float32 + # some handy constants cdef DTYPE_t INF = np.inf cdef DTYPE_t PI = np.pi From a579630f99a9c13cdace51e86fc281625bd37368 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 17:16:51 +0200 Subject: [PATCH 05/68] MAINT Do not shadow dtype names in Tempita templating --- sklearn/metrics/_dist_metrics.pxd.tp | 56 ++-- sklearn/metrics/_dist_metrics.pyx.tp | 373 +++++++++++++++++---------- 2 files changed, 261 insertions(+), 168 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index ba257e89f02d4..af3b84cac9d4a 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -3,7 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, DTYPE_t, DTYPE + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE # # On the first hand, an empty string is used for `name_suffix` # for the float64 case as to still be able to expose the original @@ -30,7 +30,7 @@ from libc.math cimport sqrt, exp from ..utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t -{{for name_suffix, DTYPE_t, DTYPE in implementation_specific_values}} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### # Inline distance functions @@ -38,8 +38,8 @@ from ..utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t # We use these for the default (euclidean) case so that they can be # inlined. This leads to faster computation for the most common case cdef inline DTYPE_t euclidean_dist{{name_suffix}}( - const {{DTYPE_t}}* x1, - const {{DTYPE_t}}* x2, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: cdef DTYPE_t tmp, d=0 @@ -51,8 +51,8 @@ cdef inline DTYPE_t euclidean_dist{{name_suffix}}( cdef inline DTYPE_t euclidean_rdist{{name_suffix}}( - const {{DTYPE_t}}* x1, - const {{DTYPE_t}}* x2, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: cdef DTYPE_t tmp, d=0 @@ -63,11 +63,11 @@ cdef inline DTYPE_t euclidean_rdist{{name_suffix}}( return d -cdef inline DTYPE_t euclidean_dist_to_rdist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1: +cdef inline DTYPE_t euclidean_dist_to_rdist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist -cdef inline DTYPE_t euclidean_rdist_to_dist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1: +cdef inline DTYPE_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) nogil except -1: return sqrt(dist) @@ -78,23 +78,29 @@ cdef class DistanceMetric{{name_suffix}}: # we must define them here so that cython's limited polymorphism will work. # Because we don't expect to instantiate a lot of these objects, the # extra memory overhead of this setup should not be an issue. 
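+    # Naming note: {{INPUT_DTYPE_t}} is substituted by Tempita with the dtype
+    # of the input arrays (np.float64 or np.float32 depending on the
+    # specialization), while DTYPE_t is always the np.float64 accumulator and
+    # output type from sklearn.utils._typedefs; the two must not share a name.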
- cdef {{DTYPE_t}} p + cdef {{INPUT_DTYPE_t}} p cdef DTYPE_t[::1] vec cdef DTYPE_t[:, ::1] mat cdef ITYPE_t size cdef object func cdef object kwargs - cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1 + cdef DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1 - cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1 + cdef DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1 cdef DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -104,9 +110,9 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1 cdef DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -116,18 +122,18 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1 cdef int pdist(self, - const {{DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1 cdef int cdist(self, - const {{DTYPE_t}}[:, ::1] X, - const {{DTYPE_t}}[:, ::1] Y, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, ) except -1 cdef int csr_pdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, @@ -135,19 +141,19 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1 cdef int csr_cdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1 - cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1 + cdef DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1 - cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1 + cdef DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1 {{endfor}} diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index d0a1328fc3e1e..d85a369b0cb39 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -3,7 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, DTYPE_t, DTYPE + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE # # # On the first hand, an empty string is used for `name_suffix` @@ -87,7 +87,7 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] -{{for name_suffix, DTYPE_t, DTYPE in implementation_specific_values}} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### # metric mappings @@ -120,7 +120,7 @@ METRIC_MAPPING{{name_suffix}} = { 'pyfunc': PyFuncDistance{{name_suffix}}, 
} -cdef inline cnp.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, cnp.npy_intp n): +cdef inline cnp.ndarray _buffer_to_ndarray{{name_suffix}}(const {{INPUT_DTYPE_t}}* x, cnp.npy_intp n): # Wrap a memory buffer with an ndarray. Warning: this is not robust. # In particular, if x is deallocated before the returned array goes # out of scope, this could cause memory errors. Since there is not @@ -130,7 +130,7 @@ cdef inline cnp.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -cdef {{DTYPE_t}} INF{{name_suffix}} = np.inf +cdef {{INPUT_DTYPE_t}} INF{{name_suffix}} = np.inf ###################################################################### @@ -335,16 +335,22 @@ cdef class DistanceMetric{{name_suffix}}: """ return - cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: """Compute the distance between vectors x1 and x2 This should be overridden in a base class. """ return -999 - cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: """Compute the rank-preserving surrogate distance between vectors x1 and x2. This can optionally be overridden in a base class. @@ -357,7 +363,7 @@ cdef class DistanceMetric{{name_suffix}}: return self.dist(x1, x2, size) cdef int pdist(self, - const {{DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1: """Compute the pairwise distances between points in X""" @@ -370,8 +376,8 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist(self, - const {{DTYPE_t}}[:, ::1] X, - const {{DTYPE_t}}[:, ::1] Y, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, ) except -1: """Compute the cross-pairwise distances between arrays X and Y""" @@ -384,9 +390,9 @@ cdef class DistanceMetric{{name_suffix}}: return 0 cdef DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -406,9 +412,9 @@ cdef class DistanceMetric{{name_suffix}}: return -999 cdef DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -443,7 +449,7 @@ cdef class DistanceMetric{{name_suffix}}: ) cdef int csr_pdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, @@ -476,10 +482,10 @@ cdef class DistanceMetric{{name_suffix}}: return 0 cdef int csr_cdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t 
size, @@ -513,11 +519,11 @@ cdef class DistanceMetric{{name_suffix}}: ) return 0 - cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: """Convert the rank-preserving surrogate distance to the distance""" return rdist - cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: """Convert the distance to the rank-preserving surrogate distance""" return dist @@ -562,18 +568,18 @@ cdef class DistanceMetric{{name_suffix}}: return dist def _pairwise_dense(self, X, Y=None): - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr + cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Xarr + cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Yarr cdef cnp.ndarray[DTYPE_t, ndim=2, mode='c'] Darr - Xarr = np.asarray(X, dtype={{DTYPE}}, order='C') + Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if Y is None: Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), dtype=DTYPE, order='C') self.pdist(Xarr, Darr) else: - Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C') + Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') @@ -583,7 +589,7 @@ cdef class DistanceMetric{{name_suffix}}: def _pairwise_sparse(self, X, Y=None): X_csr = X.tocsr() n_X, size = X_csr.shape - X_data = np.asarray(X_csr.data, dtype={{DTYPE}}) + X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X_csr.indices, dtype=np.int32) X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) @@ -599,7 +605,7 @@ cdef class DistanceMetric{{name_suffix}}: else: Y_csr = Y.tocsr() n_Y, _ = Y_csr.shape - Y_data = np.asarray(Y_csr.data, dtype={{DTYPE}}) + Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) @@ -655,18 +661,24 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 2 - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return euclidean_dist{{name_suffix}}(x1, x2, size) - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return euclidean_rdist{{name_suffix}}(x1, x2, size) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return sqrt(rdist) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist def rdist_to_dist(self, rdist): @@ -676,9 +688,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** 2 cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] 
x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -727,9 +739,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -767,8 +779,11 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t tmp, d=0 cdef cnp.intp_t j for j in range(size): @@ -776,14 +791,17 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return sqrt(self.rdist(x1, x2, size)) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return sqrt(rdist) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist def rdist_to_dist(self, rdist): @@ -793,9 +811,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** 2 cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -817,37 +835,37 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2] if ix1 == ix2: - unsquared = (x1_data[i1] - x2_data[i2]) + unsquared = (x1_data[i1] - x2_data[i2]) d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - unsquared = x1_data[i1] + unsquared = x1_data[i1] d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 else: - unsquared = x2_data[i2] + unsquared = x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix2] i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2] - unsquared = x2_data[i2] + unsquared = x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix1] i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1] - unsquared = x1_data[i1] + unsquared = x1_data[i1] d = d + (unsquared * unsquared) / self.vec[ix2] i1 = i1 + 1 return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -879,8 +897,11 @@ cdef class 
ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): @@ -888,9 +909,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -957,8 +978,11 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): @@ -967,9 +991,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1065,8 +1089,11 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). " f"Currently len(w)={self.size}.") - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d=0 cdef cnp.intp_t j cdef bint has_w = self.size > 0 @@ -1078,14 +1105,17 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs(x1[j] - x2[j]), self.p)) return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return pow(rdist, 1. 
/ self.p) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1095,9 +1125,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** self.p cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1170,9 +1200,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1239,8 +1269,11 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError('WMinkowskiDistance dist: ' 'size of w does not match') - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d = 0 cdef cnp.intp_t j @@ -1248,14 +1281,17 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)) return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return pow(rdist, 1. 
/ self.p) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1265,9 +1301,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** self.p cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1312,9 +1348,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1375,8 +1411,11 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t tmp, d = 0 cdef cnp.intp_t i, j @@ -1391,14 +1430,17 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.vec[i] return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return sqrt(self.rdist(x1, x2, size)) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return sqrt(rdist) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist def rdist_to_dist(self, rdist): @@ -1408,9 +1450,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** 2 cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1461,9 +1503,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1495,8 +1537,11 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int n_unequal = 0 cdef cnp.intp_t j for j in range(size): @@ -1506,9 +1551,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1565,8 +1610,11 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t denom, d = 0 cdef cnp.intp_t j for j in range(size): @@ -1576,9 +1624,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1632,8 +1680,11 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t num = 0, denom = 0 cdef cnp.intp_t j for j in range(size): @@ -1645,9 +1696,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0.0 cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1707,8 +1758,11 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_eq = 0, nnz = 0 cdef cnp.intp_t j for j in range(size): @@ -1724,9 +1778,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (nnz - n_eq) * 1.0 / nnz cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1789,8 +1843,11 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -1800,9 +1857,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return n_neq * 1. 
/ size cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1859,8 +1916,11 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0, n_tt = 0 cdef cnp.intp_t j for j in range(size): @@ -1871,9 +1931,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return n_neq / (2.0 * n_tt + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1933,8 +1993,11 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -1945,9 +2008,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (n_neq - n_tt + size) * 1.0 / (n_neq + size) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2005,8 +2068,11 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -2016,9 +2082,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (2.0 * n_neq) / (size + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2075,8 +2141,11 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_tt = 0 cdef 
cnp.intp_t j for j in range(size): @@ -2086,9 +2155,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (size - n_tt) * 1. / size cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2140,8 +2209,11 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -2151,9 +2223,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (2.0 * n_neq) / (size + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2210,8 +2282,11 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -2222,9 +2297,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return n_neq / (0.5 * n_tt + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2292,20 +2367,26 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return 2 * asin(sqrt(self.rdist(x1, x2, size))) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return 2 * asin(sqrt(rdist)) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, 
{{INPUT_DTYPE_t}} dist) nogil except -1: cdef DTYPE_t tmp = sin(0.5 * dist) return tmp * tmp @@ -2338,12 +2419,18 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return self._dist(x1, x2, size) - cdef inline DTYPE_t _dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) except -1 with gil: + cdef inline DTYPE_t _dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) except -1 with gil: cdef cnp.ndarray x1arr cdef cnp.ndarray x2arr x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size) From 98e9d21efc6d76f3a441e0bca5e31cc846de9b8c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 17:46:40 +0200 Subject: [PATCH 06/68] fixup! MAINT Define dtype alias for sparse matrices indices --- sklearn/metrics/_dist_metrics.pyx.tp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index d85a369b0cb39..ea5e079c19133 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -86,6 +86,7 @@ def get_valid_metric_ids(L): return [key for (key, val) in METRIC_MAPPING.items() if (val.__name__ in L) or (val in L)] +from ..utils._typedefs import SPARSE_INDEX_TYPE {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -590,8 +591,8 @@ cdef class DistanceMetric{{name_suffix}}: X_csr = X.tocsr() n_X, size = X_csr.shape X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) - X_indices = np.asarray(X_csr.indices, dtype=np.int32) - X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) + X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) if Y is None: Darr = np.zeros((n_X, n_X), dtype=DTYPE, order='C') @@ -606,8 +607,8 @@ cdef class DistanceMetric{{name_suffix}}: Y_csr = Y.tocsr() n_Y, _ = Y_csr.shape Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) - Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) - Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) + Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) Darr = np.zeros((n_X, n_Y), dtype=DTYPE, order='C') self.csr_cdist( From 8aa4e44791714d0a94ae353685a0ee021b15df58 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 18:05:59 +0200 Subject: [PATCH 07/68] TST Use cdist and pdist appropriately --- sklearn/metrics/tests/test_dist_metrics.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index d939dd431a01a..ee62d1d1f53a4 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -8,7 +8,7 @@ import pytest import scipy.sparse as sp -from scipy.spatial.distance import cdist +from scipy.spatial.distance import cdist, pdist from sklearn.metrics import DistanceMetric from sklearn.metrics._dist_metrics import ( @@ -87,7 +87,7 
@@ def test_cdist(metric_param_grid, X, Y): for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) rtol_dict = {} - if metric == "mahalanobis": + if metric == "mahalanobis" and X.dtype == np.float32: # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. @@ -119,7 +119,7 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): D_scipy_cdist = cdist(X_bool, Y_bool, metric) if metric == "jaccard" and _IS_32BIT: - pytest.skip("Jaccard Distance on 32bit architecture is unstable.") + pytest.xfail("Jaccard Distance on 32bit architecture is unstable.") dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool, Y_bool) @@ -144,7 +144,7 @@ def test_pdist(metric_param_grid, X): for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) rtol_dict = {} - if metric == "mahalanobis": + if metric == "mahalanobis" and X.dtype == np.float32: # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. @@ -159,9 +159,9 @@ def test_pdist(metric_param_grid, X): if sp_version >= parse_version("1.6.0"): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - D_scipy_pdist = cdist(X, X, metric, **kwargs) + D_scipy_pdist = pdist(X, metric, **kwargs) else: - D_scipy_pdist = cdist(X, X, metric, **kwargs) + D_scipy_pdist = pdist(X, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X) @@ -196,7 +196,7 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): - D_scipy_pdist = cdist(X_bool, X_bool, metric) + D_scipy_pdist = pdist(X_bool, metric) dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) From 9edfa11625cf119b534b5878dc739560ee5d05db Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 09:49:19 +0200 Subject: [PATCH 08/68] DOC Improve comments Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index ea5e079c19133..3273a48d9708e 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -423,10 +423,9 @@ cdef class DistanceMetric{{name_suffix}}: const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - """Compute the distance between vectors x1 and x2 represented - under the CSR format. + """Distance between rows of CSR matrices x1 and x2. - This can optionally be overridden in a base class. + This can optionally be overridden in a subclass. The rank-preserving surrogate distance is any measure that yields the same rank as the distance, but is more efficient to compute. For example, the @@ -456,8 +455,11 @@ cdef class DistanceMetric{{name_suffix}}: const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1: - """Compute the pairwise distances between points in X - represented in the CSR format.""" + """Pairwise distances between rows in CSR matrix X. + + Note that this implementation is twice faster than csr_cdist(X, X) + because it leverages the symmetry of the problem. 
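+
+        Only the pairs with i2 >= i1 are evaluated explicitly; the lower
+        triangle is filled by symmetry, which roughly halves the number of
+        csr_dist calls compared to csr_cdist(X, X).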
+ """ cdef: ITYPE_t i1, i2 ITYPE_t n_x1 = x1_indptr.shape[0] - 1 From ee5c6bf598e1b1224c1225a5127eb6628e7602b7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 09:46:58 +0200 Subject: [PATCH 09/68] Fixups --- sklearn/metrics/tests/test_dist_metrics.py | 12 ++++++++---- sklearn/utils/_typedefs.pyx | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index ee62d1d1f53a4..78e4a28ccbbd9 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -8,7 +8,7 @@ import pytest import scipy.sparse as sp -from scipy.spatial.distance import cdist, pdist +from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric from sklearn.metrics._dist_metrics import ( @@ -91,6 +91,8 @@ def test_cdist(metric_param_grid, X, Y): # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. + # TODO: Inspect slight numerical discrepancy + # with scipy rtol_dict = {"rtol": 1e-6} if metric == "wminkowski": @@ -148,6 +150,8 @@ def test_pdist(metric_param_grid, X): # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. + # TODO: Inspect slight numerical discrepancy + # with scipy rtol_dict = {"rtol": 1e-6} if metric == "wminkowski": @@ -159,9 +163,9 @@ def test_pdist(metric_param_grid, X): if sp_version >= parse_version("1.6.0"): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - D_scipy_pdist = pdist(X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) else: - D_scipy_pdist = pdist(X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X) @@ -196,7 +200,7 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): - D_scipy_pdist = pdist(X_bool, metric) + D_scipy_pdist = cdist(X_bool, X_bool, metric) dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx index 839aa4e5fde83..49d0e46101b4f 100644 --- a/sklearn/utils/_typedefs.pyx +++ b/sklearn/utils/_typedefs.pyx @@ -20,7 +20,7 @@ INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd DTYPE = np.float64 # WARNING: this should match DTYPE_t in typedefs.pxd # WARNING: this must match SPARSE_INDEX_TYPE_t in typedefs.pxd -SPARSE_INDEX_TYPE = np.float32 +SPARSE_INDEX_TYPE = np.int32 # some handy constants cdef DTYPE_t INF = np.inf From bf5eb597a5783cc8c7a90615b235949dd4ca8ea6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 11:05:23 +0200 Subject: [PATCH 10/68] MAINT Wrap of indptr values to support sparse-dense This is kind of an hack for now. IMO, it would be better to use a flatiter on a view if possible. 
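Concretely, a dense (n_Y, size) array Y is reinterpreted as a CSR-like matrix
whose indices and indptr buffers are shared by all rows, as done in
_pairwise_sparse_dense:

    Y_data = Y.reshape(-1)
    Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE)
    Y_indptr = np.arange(stop=size * (n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE)

The csr_* kernels then take their running position modulo len(indices) to
recover the column index, so the same code paths handle CSR and dense inputs.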
See discussions on: https://groups.google.com/g/cython-users/c/MR4xWCvUKHU Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 286 +++++++++++++++++---- sklearn/metrics/tests/test_dist_metrics.py | 39 ++- 2 files changed, 269 insertions(+), 56 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 3273a48d9708e..739a16d09b8d2 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -456,7 +456,7 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] D, ) nogil except -1: """Pairwise distances between rows in CSR matrix X. - + Note that this implementation is twice faster than csr_cdist(X, X) because it leverages the symmetry of the problem. """ @@ -570,34 +570,34 @@ cdef class DistanceMetric{{name_suffix}}: """ return dist - def _pairwise_dense(self, X, Y=None): + def _pairwise_dense_dense(self, X, Y): cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Xarr cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Yarr cdef cnp.ndarray[DTYPE_t, ndim=2, mode='c'] Darr Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) - if Y is None: - Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), + if X is Y: + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=DTYPE, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') self.cdist(Xarr, Yarr, Darr) return Darr - def _pairwise_sparse(self, X, Y=None): + def _pairwise_sparse_sparse(self, X, Y): X_csr = X.tocsr() n_X, size = X_csr.shape X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) - if Y is None: - Darr = np.zeros((n_X, n_X), dtype=DTYPE, order='C') + if X is Y: + Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') self.csr_pdist( x1_data=X_data, x1_indices=X_indices, @@ -612,7 +612,7 @@ cdef class DistanceMetric{{name_suffix}}: Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) - Darr = np.zeros((n_X, n_Y), dtype=DTYPE, order='C') + Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.csr_cdist( x1_data=X_data, x1_indices=X_indices, @@ -625,6 +625,31 @@ cdef class DistanceMetric{{name_suffix}}: ) return Darr + def _pairwise_sparse_dense(self, X, Y): + X_csr = X.tocsr() + n_X, size = X_csr.shape + X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) + + n_Y, _ = Y.shape + Y_data = Y.reshape(-1) + Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.arange(stop=size*(n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE) + + Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') + self.csr_cdist( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=Y_data, + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=size, + D=Darr, + ) + return Darr + def pairwise(self, X, Y=None): """Compute the pairwise distances between X and Y @@ -646,11 +671,20 @@ cdef class DistanceMetric{{name_suffix}}: The shape (Nx, Ny) array of pairwise distances between points in X and Y. 
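+
+        Examples
+        --------
+        A dense array and its CSR representation should yield the same
+        distances (illustrative values):
+
+        >>> import numpy as np
+        >>> from scipy.sparse import csr_matrix
+        >>> from sklearn.metrics import DistanceMetric
+        >>> X = np.array([[1., 0., 2.], [0., 3., 0.]])
+        >>> dist = DistanceMetric.get_metric("euclidean")
+        >>> np.allclose(dist.pairwise(csr_matrix(X)), dist.pairwise(X))
+        True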
""" - if not issparse(X) and (Y is None or not issparse(Y)): - return self._pairwise_dense(X, Y) - if issparse(X) and (Y is None or issparse(Y)): - return self._pairwise_sparse(X, Y) + Y = X if Y is None else Y + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return self._pairwise_dense_dense(X, Y) + if X_is_sparse and Y_is_sparse: + return self._pairwise_sparse_sparse(X, Y) + if X_is_sparse and not Y_is_sparse: + return self._pairwise_sparse_dense(X, Y) + if not X_is_sparse and Y_is_sparse: + # Swapping argument and transposing the result + return self._pairwise_sparse_dense(Y, X).T #------------------------------------------------------------ # Euclidean Distance @@ -706,13 +740,21 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: unsquared = (x1_data[i1] - x2_data[i2]) @@ -829,13 +871,21 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: unsquared = (x1_data[i1] - x2_data[i2]) @@ -927,12 +977,20 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + fabs(x1_data[i1] - x2_data[i2]) @@ -1009,12 +1067,20 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) @@ -1143,14 +1209,22 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 bint has_w = self.size > 0 if has_w: while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + (self.vec[ix1] * pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) @@ -1175,8 +1249,14 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d else: while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + (pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) @@ -1319,12 +1399,20 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + pow(self.vec[ix1] * fabs(x1_data[i1] - x2_data[i2]), self.p) @@ -1468,12 +1556,20 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t tmp, d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: self.vec[ix1] = x1_data[i1] - x2_data[i2] @@ -1569,12 +1665,20 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d += (x1_data[i1] != x2_data[i2]) @@ -1642,12 +1746,20 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d += fabs(x1_data[i1] - x2_data[i2]) / (fabs(x1_data[i1]) + fabs(x2_data[i2])) @@ -1714,13 +1826,21 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t num = 0.0 DTYPE_t denom = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. 
+ # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: num += fabs(x1_data[i1] - x2_data[i2]) @@ -1796,12 +1916,20 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -1875,12 +2003,20 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: tf1 = x1_data[i1] != 0 @@ -1949,12 +2085,20 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2026,12 +2170,20 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. 
+ # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2100,12 +2252,20 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2173,12 +2333,20 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2241,12 +2409,20 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2315,12 +2491,20 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. 
+ # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 78e4a28ccbbd9..6528ed5f381f5 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -106,11 +106,20 @@ def test_cdist(metric_param_grid, X, Y): D_scipy_cdist = cdist(X, Y, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) + + # DistanceMetric.pairwise must be consistent + # on all combinations of format in {sparse, dense}². D_sklearn = dm.pairwise(X, Y) assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) - D_sklearn_csr = dm.pairwise(X_csr, Y_csr) - assert_allclose(D_sklearn_csr, D_scipy_cdist, **rtol_dict) + D_sklearn = dm.pairwise(X_csr, Y_csr) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X_csr, Y) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X, Y_csr) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) @pytest.mark.parametrize("metric", BOOL_METRICS) @@ -127,9 +136,22 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): D_sklearn = dm.pairwise(X_bool, Y_bool) assert_allclose(D_sklearn, D_scipy_cdist) - X_bool_csr, Y_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) - D_sklearn_csr = dm.pairwise(X_bool_csr, Y_csr) - assert_allclose(D_sklearn_csr, D_scipy_cdist) + # DistanceMetric.pairwise must be consistent + # on all combinations of format in {sparse, dense}². + X_bool_csr, Y_bool_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) + + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) + assert_allclose(D_sklearn, D_scipy_cdist) + + # TODO: inspect failures on Boolean DistanceMetric + # D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + # assert_allclose(D_sklearn, D_scipy_cdist) + + # D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + # assert_allclose(D_sklearn, D_scipy_cdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @@ -174,6 +196,9 @@ def test_pdist(metric_param_grid, X): D_sklearn_csr = dm.pairwise(X_csr) assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + D_sklearn_csr = dm.pairwise(X_csr, X_csr) + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @@ -205,6 +230,10 @@ def test_pdist_bool_metrics(metric, X_bool): D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) + X_bool_csr = sp.csr_matrix(X_bool) + D_sklearn = dm.pairwise(X_bool_csr) + assert_allclose(D_sklearn, D_scipy_pdist) + # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") From 92b8a6c8d26c1059c4af1c5101e96823fe56d114 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 14:23:07 +0200 Subject: [PATCH 11/68] Apply review comments Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 168 +++++++++++---------------- sklearn/utils/_typedefs.pxd | 1 + 2 files changed, 66 insertions(+), 103 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 739a16d09b8d2..2945a99290386 100644 --- 
a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -632,10 +632,23 @@ cdef class DistanceMetric{{name_suffix}}: X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) + # To avoid introducing redundant implementations for the CSR × dense array + # case, we wrap the dense array into a fake CSR datastructure and leverage + # the existing code for the CSR × CSR case. + # The true CSR representation of a dense array would require allocating + # a Y_indices matrix of shape (n_samples, n_features) with repeated + # contiguous integers from 0 to n_features - 1 on each row which would + # be very wasteful from a memory point of view. Instead we only allocate + # a single row and adapt the CSR × CSR routines to use a modulo operation + # when accessing Y_indices in order to achieve the same result without having + # to materialize the indices repetition explicitly. + n_Y, _ = Y.shape Y_data = Y.reshape(-1) Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) - Y_indptr = np.arange(stop=size*(n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.arange( + start=0, stop=size * (n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE + ) Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.csr_cdist( @@ -747,12 +760,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -878,12 +888,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -983,12 +990,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1073,12 +1077,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1405,12 +1406,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1562,12 +1560,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t tmp, d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1671,12 +1666,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1752,12 +1744,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. 
- # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1833,12 +1822,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t denom = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1922,12 +1908,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2009,12 +1992,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2091,12 +2071,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2176,12 +2153,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2258,12 +2232,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2339,12 +2310,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2415,12 +2383,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2497,12 +2462,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. 
- # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index 9c2db8cf32c4a..a6e390705496b 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -24,4 +24,5 @@ ctypedef cnp.int64_t INT64TYPE_t # WARNING: should match INT32TYPE in typedefs. # cases. # # TODO: support cnp.int64_t for this case +# See: https://github.com/scikit-learn/scikit-learn/issues/23653 ctypedef cnp.int32_t SPARSE_INDEX_TYPE_t From dc6f8cf70ec138ea3dbc32337ba072a65e6abba1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Jun 2022 15:52:28 +0200 Subject: [PATCH 12/68] More interesting boolean data for tests --- sklearn/metrics/tests/test_dist_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 6528ed5f381f5..c97114117c72f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -38,8 +38,8 @@ def dist_func(x1, x2, p): [X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64]) # make boolean arrays: ones and zeros -X_bool = X64.round(0) -Y_bool = Y64.round(0) +X_bool = (X64 < 0.3).astype(np.float64) # quite sparse +Y_bool = (Y64 < 0.7).astype(np.float64) # not too sparse [X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool]) From bb06f592a3b8ac3e7d988a75a1ad24ae60d7e576 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 15:49:38 +0200 Subject: [PATCH 13/68] FIX Various corrections --- sklearn/metrics/_dist_metrics.pyx.tp | 38 +++++++++------------- sklearn/metrics/tests/test_dist_metrics.py | 4 +-- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 2945a99290386..5ea2a836b756b 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -910,15 +910,15 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] + ix2 = x2_indices[i2 % len_x2_indices] unsquared = x2_data[i2] - d = d + (unsquared * unsquared) / self.vec[ix1] + d = d + (unsquared * unsquared) / self.vec[ix2] i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] + ix1 = x1_indices[i1 % len_x1_indices] unsquared = x1_data[i1] - d = d + (unsquared * unsquared) / self.vec[ix2] + d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 return d @@ -1218,12 +1218,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if has_w: while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. 
See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1240,22 +1237,21 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: + ix2 = x2_indices[i2 % len_x2_indices] d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: + ix1 = x1_indices[i1 % len_x1_indices] d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d else: while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1272,12 +1268,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 @@ -1425,12 +1419,12 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] + ix2 = x2_indices[i2 % len_x2_indices] d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] + ix1 = x1_indices[i1 % len_x1_indices] d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 @@ -1579,12 +1573,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] + ix2 = x2_indices[i2 % len_x2_indices] self.vec[ix2] = - x2_data[i2] i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] + ix1 = x1_indices[i1 % len_x1_indices] self.vec[ix1] = x1_data[i1] i1 = i1 + 1 diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index c97114117c72f..ec46db8fc1596 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -3,8 +3,6 @@ import copy import numpy as np -from sklearn.utils._testing import assert_allclose - import pytest import scipy.sparse as sp @@ -18,7 +16,7 @@ ) from sklearn.utils import check_random_state, _IS_32BIT -from sklearn.utils._testing import create_memmap_backed_data +from sklearn.utils._testing import assert_allclose, create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version From a5eb20dfe1de36fed91ddf398c9760224c5661f3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 16:45:11 +0200 Subject: [PATCH 14/68] FIX Make Jaccard, Hamming and Hashing robust to explicit zeros Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 32 +++++++++++++++------- sklearn/metrics/tests/test_dist_metrics.py | 9 +++--- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 5ea2a836b756b..490da7d9837c0 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ 
-406,7 +406,12 @@ cdef class DistanceMetric{{name_suffix}}: This must be overridden in a base class. - Note that we pass all the parameter as to not use memoryview slicing + Notes + ----- + The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + All the parameters are passed as to not use memoryview slicing because it is currently known to slow down execution as it takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 """ @@ -432,7 +437,12 @@ cdef class DistanceMetric{{name_suffix}}: rank-preserving surrogate distance of the Euclidean metric is the squared-euclidean distance. - Note that we pass all the parameter as to not use memoryview slicing + Notes + ----- + The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + All the parameters are passed as to not use memoryview slicing because it is currently known to slow down execution as it takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 """ @@ -1671,19 +1681,19 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d += 1 + d += (x1_data[i1] != 0) i1 = i1 + 1 else: - d += 1 + d += (x2_data[i2] != 0) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d += 1 + d += (x2_data[i2] != 0) i2 = i2 + 1 else: while i1 < x1_end: - d += 1 + d += (x1_data[i1] != 0) i1 = i1 + 1 d /= size @@ -1925,10 +1935,12 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: + tf2 = x2_data[i2] != 0 nnz += tf2 i2 = i2 + 1 else: while i1 < x1_end: + tf1 = x1_data[i1] != 0 nnz += tf1 i1 = i1 + 1 @@ -1999,19 +2011,19 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += (x1_data[i1] != 0) i1 = i1 + 1 else: - n_neq += 1 + n_neq += (x2_data[i2] != 0) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + n_neq += (x2_data[i2] != 0) i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + n_neq += (x1_data[i1] != 0) i1 = i1 + 1 return n_neq * 1.0 / size diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index ec46db8fc1596..07e248431c253 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -144,12 +144,11 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) assert_allclose(D_sklearn, D_scipy_cdist) - # TODO: inspect failures on Boolean DistanceMetric - # D_sklearn = dm.pairwise(X_bool, Y_bool_csr) - # assert_allclose(D_sklearn, D_scipy_cdist) + D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + assert_allclose(D_sklearn, D_scipy_cdist) - # D_sklearn = dm.pairwise(X_bool_csr, Y_bool) - # assert_allclose(D_sklearn, D_scipy_cdist) + D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed From 19edf11a1d0b001dc00b9e70abedac19836daee4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 16:58:14 +0200 Subject: [PATCH 15/68] FIX Make the other boolean DistanceMetric also robust to explicit zeros Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 50 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp 
b/sklearn/metrics/_dist_metrics.pyx.tp index 490da7d9837c0..efdf02d4b34ba 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -2092,19 +2092,21 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return n_neq / (2.0 * n_tt + n_neq) @@ -2174,19 +2176,21 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return (n_neq - n_tt + size) * 1.0 / (n_neq + size) @@ -2252,19 +2256,21 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return (2.0 * n_neq) / (size + n_neq) @@ -2403,19 +2409,21 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return (2.0 * n_neq) / (size + n_neq) @@ -2483,19 +2491,21 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return n_neq / (0.5 * n_tt + n_neq) From de8680278e94d8090024c9a85b79e720e24e04cd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 17:01:20 +0200 Subject: [PATCH 16/68] TST Remove xfail for Jaccard on 32bit arch. 
--- sklearn/metrics/tests/test_dist_metrics.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 07e248431c253..68b4f36a7d1c9 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -15,7 +15,7 @@ DistanceMetric32, ) -from sklearn.utils import check_random_state, _IS_32BIT +from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -127,9 +127,6 @@ def test_cdist(metric_param_grid, X, Y): def test_cdist_bool_metric(metric, X_bool, Y_bool): D_scipy_cdist = cdist(X_bool, Y_bool, metric) - if metric == "jaccard" and _IS_32BIT: - pytest.xfail("Jaccard Distance on 32bit architecture is unstable.") - dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool, Y_bool) assert_allclose(D_sklearn, D_scipy_cdist) From bb920cfe6bbf21d9cdfbd944df547e1406594855 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 18:18:27 +0200 Subject: [PATCH 17/68] Cast to np.float64_t where appropriate --- sklearn/metrics/_dist_metrics.pxd.tp | 2 +- sklearn/metrics/_dist_metrics.pyx.tp | 127 +++++++++++++++------------ 2 files changed, 70 insertions(+), 59 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index af3b84cac9d4a..8ae0190e6c478 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -78,7 +78,7 @@ cdef class DistanceMetric{{name_suffix}}: # we must define them here so that cython's limited polymorphism will work. # Because we don't expect to instantiate a lot of these objects, the # extra memory overhead of this setup should not be an issue. 
- cdef {{INPUT_DTYPE_t}} p + cdef DTYPE_t p cdef DTYPE_t[::1] vec cdef DTYPE_t[:, ::1] mat cdef ITYPE_t size diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index efdf02d4b34ba..094ef38cb240d 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -588,14 +588,12 @@ cdef class DistanceMetric{{name_suffix}}: Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if X is Y: - Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), - dtype=DTYPE, order='C') + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=DTYPE, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), - dtype=DTYPE, order='C') + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') self.cdist(Xarr, Yarr, Darr) return Darr @@ -777,7 +775,7 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = (x1_data[i1] - x2_data[i2]) + unsquared = ((x1_data[i1]) - (x2_data[i2])) d = d + (unsquared * unsquared) i1 = i1 + 1 i2 = i2 + 1 @@ -852,8 +850,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t tmp, d=0 cdef cnp.intp_t j for j in range(size): - tmp = (x1[j] - x2[j]) - d += (tmp * tmp / self.vec[j]) + tmp = ((x1[j]) - (x2[j])) + d += (tmp * tmp / self.vec[j]) return d cdef inline DTYPE_t dist(self, @@ -905,7 +903,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = (x1_data[i1] - x2_data[i2]) + unsquared = ((x1_data[i1]) - (x2_data[i2])) d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 i2 = i2 + 1 @@ -975,7 +973,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += fabs(x1[j] - x2[j]) + d += fabs((x1[j]) - (x2[j])) return d cdef inline DTYPE_t csr_dist(self, @@ -1007,23 +1005,23 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + fabs(x1_data[i1] - x2_data[i2]) + d = d + fabs((x1_data[i1]) - (x2_data[i2])) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 else: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 else: while i1 < x1_end: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 return d @@ -1061,7 +1059,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d = fmax(d, fabs(x1[j] - x2[j])) + d = fmax(d, fabs( (x1[j]) - (x2[j]))) return d @@ -1094,23 +1092,23 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) + d = fmax(d, fabs((x1_data[i1]) - (x2_data[i2]))) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 else: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 else: 
while i1 < x1_end: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 return d @@ -1179,10 +1177,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef bint has_w = self.size > 0 if has_w: for j in range(size): - d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) + d += (self.vec[j] * pow(fabs( + (x1[j]) - (x2[j]) + ), self.p)) else: for j in range(size): - d += (pow(fabs(x1[j] - x2[j]), self.p)) + d += (pow(fabs((x1[j]) - (x2[j])), self.p)) return d cdef inline DTYPE_t dist(self, @@ -1235,25 +1235,27 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + d = d + (self.vec[ix1] * pow(fabs( + (x1_data[i1]) - (x2_data[i2]) + ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1266,23 +1268,25 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + (pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + d = d + (pow(fabs( + (x1_data[i1]) - (x2_data[i2]) + ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1366,7 +1370,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)) + d += (pow(self.vec[j] * fabs( + (x1[j]) - (x2[j]) + ), self.p)) return d cdef inline DTYPE_t dist(self, @@ -1417,25 +1423,27 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + pow(self.vec[ix1] * fabs(x1_data[i1] - x2_data[i2]), self.p) + d = d + pow(self.vec[ix1] * fabs( + (x1_data[i1]) - (x2_data[i2]) + ), self.p) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 else: - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + 
pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 return d @@ -1514,7 +1522,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # compute (x1 - x2).T * VI * (x1 - x2) for i in range(size): - self.vec[i] = x1[i] - x2[i] + self.vec[i] = (x1[i]) - (x2[i]) for i in range(size): tmp = 0 @@ -1571,7 +1579,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - self.vec[ix1] = x1_data[i1] - x2_data[i2] + self.vec[ix1] = (x1_data[i1]) - (x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: @@ -1721,9 +1729,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t denom, d = 0 cdef cnp.intp_t j for j in range(size): - denom = (fabs(x1[j]) + fabs(x2[j])) + denom = fabs((x1[j])) + fabs((x2[j])) if denom > 0: - d += (fabs(x1[j] - x2[j])) / denom + d += fabs((x1[j]) - (x2[j])) / denom return d cdef inline DTYPE_t csr_dist(self, @@ -1755,7 +1763,10 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d += fabs(x1_data[i1] - x2_data[i2]) / (fabs(x1_data[i1]) + fabs(x2_data[i2])) + d += ( + fabs((x1_data[i1]) - (x2_data[i2])) / + (fabs((x1_data[i1])) + fabs((x2_data[i2]))) + ) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: @@ -1796,8 +1807,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t num = 0, denom = 0 cdef cnp.intp_t j for j in range(size): - num += fabs(x1[j] - x2[j]) - denom += (fabs(x1[j]) + fabs(x2[j])) + num += fabs((x1[j]) - (x2[j])) + denom += (fabs(x1[j]) + fabs(x2[j])) if denom > 0: return num / denom else: @@ -1833,28 +1844,28 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - num += fabs(x1_data[i1] - x2_data[i2]) - denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + num += fabs((x1_data[i1]) - (x2_data[i2])) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i1 = i1 + 1 else: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i2 = i2 + 1 else: while i1 < x1_end: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i1 = i1 + 1 return num / denom @@ -2537,9 +2548,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) - cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) - return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + cdef DTYPE_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) + cdef DTYPE_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) cdef inline DTYPE_t dist(self, const {{INPUT_DTYPE_t}}* x1, From b3759fead28f064e519701786e0d0fd9522437bf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Jun 2022 12:34:23 +0200 Subject: [PATCH 18/68] Rename methods and correctly format their signatures Co-authored-by: Christian Lorentzen --- 
sklearn/metrics/_dist_metrics.pxd.tp | 24 ++- sklearn/metrics/_dist_metrics.pyx.tp | 189 ++++++++++++++------- sklearn/metrics/tests/test_dist_metrics.py | 4 +- 3 files changed, 141 insertions(+), 76 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 8ae0190e6c478..a12c089b089fc 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -85,19 +85,22 @@ cdef class DistanceMetric{{name_suffix}}: cdef object func cdef object kwargs - cdef DTYPE_t dist(self, + cdef DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1 - cdef DTYPE_t rdist(self, + cdef DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1 - cdef DTYPE_t csr_dist(self, + cdef DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -109,7 +112,8 @@ cdef class DistanceMetric{{name_suffix}}: const ITYPE_t size, ) nogil except -1 - cdef DTYPE_t csr_rdist(self, + cdef DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -121,18 +125,21 @@ cdef class DistanceMetric{{name_suffix}}: const ITYPE_t size, ) nogil except -1 - cdef int pdist(self, + cdef int pdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1 - cdef int cdist(self, + cdef int cdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, ) except -1 - cdef int csr_pdist(self, + cdef int pdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, @@ -140,7 +147,8 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] D, ) nogil except -1 - cdef int csr_cdist(self, + cdef int cdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 094ef38cb240d..e83254dec8895 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -336,7 +336,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return - cdef DTYPE_t dist(self, + cdef DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -347,7 +348,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return -999 - cdef DTYPE_t rdist(self, + cdef DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -363,7 +365,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return self.dist(x1, x2, size) - cdef int pdist(self, + cdef int pdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1: @@ -376,7 +379,8 @@ cdef class DistanceMetric{{name_suffix}}: return 0 - cdef int cdist(self, + cdef int cdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, @@ -390,7 +394,8 @@ cdef class DistanceMetric{{name_suffix}}: D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 - cdef DTYPE_t csr_dist(self, + cdef DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -417,7 +422,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return -999 - cdef DTYPE_t csr_rdist(self, + cdef 
DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -446,7 +452,7 @@ cdef class DistanceMetric{{name_suffix}}: because it is currently known to slow down execution as it takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 """ - return self.csr_dist( + return self.dist_csr( x1_data, x1_indices, x2_data, @@ -458,7 +464,8 @@ cdef class DistanceMetric{{name_suffix}}: size, ) - cdef int csr_pdist(self, + cdef int pdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, @@ -467,7 +474,7 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1: """Pairwise distances between rows in CSR matrix X. - Note that this implementation is twice faster than csr_cdist(X, X) + Note that this implementation is twice faster than cdist_csr(X, X) because it leverages the symmetry of the problem. """ cdef: @@ -481,7 +488,7 @@ cdef class DistanceMetric{{name_suffix}}: for i2 in range(i1, n_x1): x2_start = x1_indptr[i2] x2_end = x1_indptr[i2 + 1] - D[i1, i2] = D[i2, i1] = self.csr_dist( + D[i1, i2] = D[i2, i1] = self.dist_csr( x1_data, x1_indices, x1_data, @@ -494,7 +501,8 @@ cdef class DistanceMetric{{name_suffix}}: ) return 0 - cdef int csr_cdist(self, + cdef int cdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, @@ -519,7 +527,7 @@ cdef class DistanceMetric{{name_suffix}}: x2_start = x2_indptr[i2] x2_end = x2_indptr[i2 + 1] - D[i1, i2] = self.csr_dist( + D[i1, i2] = self.dist_csr( x1_data, x1_indices, x2_data, @@ -606,7 +614,7 @@ cdef class DistanceMetric{{name_suffix}}: if X is Y: Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') - self.csr_pdist( + self.pdist_csr( x1_data=X_data, x1_indices=X_indices, x1_indptr=X_indptr, @@ -621,7 +629,7 @@ cdef class DistanceMetric{{name_suffix}}: Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') - self.csr_cdist( + self.cdist_csr( x1_data=X_data, x1_indices=X_indices, x1_indptr=X_indptr, @@ -659,7 +667,7 @@ cdef class DistanceMetric{{name_suffix}}: ) Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') - self.csr_cdist( + self.cdist_csr( x1_data=X_data, x1_indices=X_indices, x1_indptr=X_indptr, @@ -745,7 +753,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -801,7 +810,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -812,7 +822,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - return sqrt(self.csr_rdist( + return sqrt( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -842,7 +853,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const 
{{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -854,7 +866,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -873,7 +886,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -930,7 +944,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -941,7 +956,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - return sqrt(self.csr_rdist( + return sqrt( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -965,7 +981,8 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -976,7 +993,8 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs((x1[j]) - (x2[j])) return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1051,7 +1069,8 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1063,7 +1082,8 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1167,7 +1187,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). 
" f"Currently len(w)={self.size}.") - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1185,7 +1206,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs((x1[j]) - (x2[j])), self.p)) return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1204,7 +1226,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1291,7 +1314,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1303,7 +1327,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const ITYPE_t size, ) nogil except -1: return pow( - self.csr_rdist( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -1361,7 +1385,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError('WMinkowskiDistance dist: ' 'size of w does not match') - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1375,7 +1400,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ), self.p)) return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1394,7 +1420,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1448,7 +1475,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1460,7 +1488,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const ITYPE_t size, ) nogil except -1: return pow( - self.csr_rdist( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -1512,7 +1540,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1531,7 +1560,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.vec[i] return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1550,7 +1580,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline DTYPE_t csr_rdist(self, + 
cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1608,7 +1639,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1619,7 +1651,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - return sqrt(self.csr_rdist( + return sqrt( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -1643,7 +1676,8 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1656,7 +1690,8 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1721,7 +1756,8 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1734,7 +1770,8 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs((x1[j]) - (x2[j])) / denom return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1799,7 +1836,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1814,7 +1852,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1882,7 +1921,8 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1901,7 +1941,8 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0 return (nnz - n_eq) * 1.0 / nnz - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1974,7 +2015,8 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1987,7 +2029,8 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. / size - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2052,7 +2095,8 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2066,7 +2110,8 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq / (2.0 * n_tt + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2136,7 +2181,8 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2150,7 +2196,8 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (n_neq - n_tt + size) * 1.0 / (n_neq + size) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2218,7 +2265,8 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2231,7 +2279,8 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const 
{{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2298,7 +2347,8 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2311,7 +2361,8 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (size - n_tt) * 1. / size - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2371,7 +2422,8 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2384,7 +2436,8 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2451,7 +2504,8 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2465,7 +2519,8 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return n_neq / (0.5 * n_tt + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2595,14 +2650,16 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: return self._dist(x1, x2, size) - cdef inline DTYPE_t _dist(self, + cdef inline DTYPE_t _dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 68b4f36a7d1c9..714f200397532 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -105,8 +105,8 @@ def test_cdist(metric_param_grid, X, Y): dm = DistanceMetricInterface.get_metric(metric, **kwargs) - # DistanceMetric.pairwise must be consistent - # on all combinations of format in {sparse, dense}². + # DistanceMetric.pairwise must be consistent for all + # combinations of formats in {sparse, dense}. D_sklearn = dm.pairwise(X, Y) assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) From 7f89236a58604597f08281eecd4013f9a6798aeb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Jun 2022 12:44:46 +0200 Subject: [PATCH 19/68] fixup! TST Remove xfail for Jaccard on 32bit arch. 
--- sklearn/metrics/_dist_metrics.pyx.tp | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index e83254dec8895..f36a0701c32f9 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -1198,12 +1198,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef bint has_w = self.size > 0 if has_w: for j in range(size): - d += (self.vec[j] * pow(fabs( + d += (self.vec[j] * pow(fabs( (x1[j]) - (x2[j]) ), self.p)) else: for j in range(size): - d += (pow(fabs((x1[j]) - (x2[j])), self.p)) + d += (pow(fabs((x1[j]) - (x2[j])), self.p)) return d cdef inline DTYPE_t dist( @@ -1212,13 +1212,13 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) + return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow(rdist, 1. / self.p) + return pow( rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow(dist, self.p) + return pow( dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. / self.p) @@ -1395,9 +1395,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += (pow(self.vec[j] * fabs( - (x1[j]) - (x2[j]) - ), self.p)) + d += (pow(self.vec[j] * fabs((x1[j]) - (x2[j])), self.p)) return d cdef inline DTYPE_t dist( @@ -1406,13 +1404,13 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) + return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow(rdist, 1. / self.p) + return pow( rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow(dist, self.p) + return pow( dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. 
/ self.p) @@ -1846,7 +1844,7 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef cnp.intp_t j for j in range(size): num += fabs((x1[j]) - (x2[j])) - denom += (fabs(x1[j]) + fabs(x2[j])) + denom += (fabs(x1[j]) + fabs(x2[j])) if denom > 0: return num / denom else: @@ -2618,7 +2616,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 2 * asin(sqrt(rdist)) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - cdef DTYPE_t tmp = sin(0.5 * dist) + cdef DTYPE_t tmp = sin(0.5 * dist) return tmp * tmp def rdist_to_dist(self, rdist): From 01a0c3311cc566ba83855ac0743bcc4c2ae56633 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Jun 2022 15:42:31 +0200 Subject: [PATCH 20/68] FEA CSR support for HaversineDistance --- sklearn/metrics/_dist_metrics.pyx.tp | 99 ++++++++++++++++++++++ sklearn/metrics/tests/test_dist_metrics.py | 43 +++++++--- 2 files changed, 132 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index f36a0701c32f9..1fd5dc7dfdeed 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -2626,6 +2626,105 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): tmp = np.sin(0.5 * dist) return tmp * tmp + cdef inline DTYPE_t dist_csr( + self, + const {{INPUT_DTYPE_t}}[:] x1_data, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const {{INPUT_DTYPE_t}}[:] x2_data, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return 2 * asin(sqrt(self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ))) + + cdef inline DTYPE_t rdist_csr( + self, + const {{INPUT_DTYPE_t}}[:] x1_data, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const {{INPUT_DTYPE_t}}[:] x2_data, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] + + DTYPE_t x1_0 = 0 + DTYPE_t x1_1 = 0 + DTYPE_t x2_0 = 0 + DTYPE_t x2_1 = 0 + DTYPE_t sin_0 + DTYPE_t sin_1 + + while i1 < x1_end and i2 < x2_end: + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] + + # Find the components in the 2D vectors to work with + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + + i1 = i1 + 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2 % len_x2_indices] + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1 % len_x1_indices] + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + i1 = i1 + 1 + + sin_0 = sin(0.5 * (x1_0 - x2_0)) + sin_1 = sin(0.5 * (x1_1 - x2_1)) + + return (sin_0 * sin_0 + cos(x1_0) * cos(x2_0) * sin_1 * sin_1) + #------------------------------------------------------------ # User-defined distance # diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 714f200397532..eb4cd384a966f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -266,7 +266,20 @@ def test_pickle_bool_metrics(metric, X_bool): assert_allclose(D1, D2) -def test_haversine_metric(): +@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) +def test_haversine_metric(X, Y): + DistanceMetricInterface = ( + DistanceMetric if X.dtype == np.float64 else DistanceMetric32 + ) + + # The Haversine DistanceMetric only works on 2 features. + X = np.asarray(X[:, :2]) + Y = np.asarray(Y[:, :2]) + + X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y) + + # Haversine is not supported by scipy.special.distance.{cdist,pdist} + # So we reimplement it to have a reference. 
def haversine_slow(x1, x2): return 2 * np.arcsin( np.sqrt( @@ -275,18 +288,28 @@ def haversine_slow(x1, x2): ) ) - X = np.random.random((10, 2)) + D_reference = np.zeros((X_csr.shape[0], Y_csr.shape[0])) + for i, xi in enumerate(X): + for j, yj in enumerate(Y): + D_reference[i, j] = haversine_slow(xi, yj) - haversine = DistanceMetric.get_metric("haversine") + haversine = DistanceMetricInterface.get_metric("haversine") - D1 = haversine.pairwise(X) - D2 = np.zeros_like(D1) - for i, x1 in enumerate(X): - for j, x2 in enumerate(X): - D2[i, j] = haversine_slow(x1, x2) + D_sklearn = haversine.pairwise(X, Y) + assert_allclose( + haversine.dist_to_rdist(D_sklearn), np.sin(0.5 * D_reference) ** 2, rtol=1e-6 + ) - assert_allclose(D1, D2) - assert_allclose(haversine.dist_to_rdist(D1), np.sin(0.5 * D2) ** 2) + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y_csr) + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y) + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X, Y_csr) + assert_allclose(D_sklearn, D_reference) def test_pyfunc_metric(): From 7d8a7173c03ef11dac9788de5682c6b4b4efef6e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Jun 2022 10:44:29 +0200 Subject: [PATCH 21/68] Fix typo Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 1fd5dc7dfdeed..4670f98030e0a 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -409,7 +409,7 @@ cdef class DistanceMetric{{name_suffix}}: """Compute the distance between vectors x1 and x2 represented under the CSR format. - This must be overridden in a base class. + This must be overridden in a subclass. 
Notes ----- From 563e3590c4bb5b3e3a6de4f42bc2e6f56a4769dc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Jun 2022 11:54:40 +0200 Subject: [PATCH 22/68] Do not upcast to 64bit yet keep the same precision --- sklearn/metrics/_dist_metrics.pyx.tp | 120 ++++++++++----------- sklearn/metrics/tests/test_dist_metrics.py | 8 +- 2 files changed, 65 insertions(+), 63 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 4670f98030e0a..2784a767cf0c9 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -784,7 +784,7 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = ((x1_data[i1]) - (x2_data[i2])) + unsquared = x1_data[i1] - x2_data[i2] d = d + (unsquared * unsquared) i1 = i1 + 1 i2 = i2 + 1 @@ -862,7 +862,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t tmp, d=0 cdef cnp.intp_t j for j in range(size): - tmp = ((x1[j]) - (x2[j])) + tmp = x1[j] - x2[j] d += (tmp * tmp / self.vec[j]) return d @@ -917,7 +917,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = ((x1_data[i1]) - (x2_data[i2])) + unsquared = x1_data[i1] - x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 i2 = i2 + 1 @@ -990,7 +990,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += fabs((x1[j]) - (x2[j])) + d += fabs(x1[j] - x2[j]) return d cdef inline DTYPE_t dist_csr( @@ -1023,23 +1023,23 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + fabs((x1_data[i1]) - (x2_data[i2])) + d = d + fabs(x1_data[i1] - x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 else: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 else: while i1 < x1_end: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 return d @@ -1078,7 +1078,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d = fmax(d, fabs( (x1[j]) - (x2[j]))) + d = fmax(d, fabs(x1[j] - x2[j])) return d @@ -1112,23 +1112,23 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = fmax(d, fabs((x1_data[i1]) - (x2_data[i2]))) + d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 else: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 else: while i1 < x1_end: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 return d @@ -1198,12 +1198,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef bint has_w = self.size > 0 if has_w: for j in range(size): - d += (self.vec[j] * pow(fabs( - (x1[j]) - (x2[j]) - ), self.p)) + d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) else: for j 
in range(size): - d += (pow(fabs((x1[j]) - (x2[j])), self.p)) + d += (pow(fabs(x1[j] - x2[j]), self.p)) return d cdef inline DTYPE_t dist( @@ -1215,10 +1213,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow( rdist, 1. / self.p) + return pow(rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow( dist, self.p) + return pow(dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. / self.p) @@ -1259,26 +1257,26 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d = d + (self.vec[ix1] * pow(fabs( - (x1_data[i1]) - (x2_data[i2]) + x1_data[i1] - x2_data[i2] ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1292,24 +1290,24 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d = d + (pow(fabs( - (x1_data[i1]) - (x2_data[i2]) + x1_data[i1] - x2_data[i2] ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1395,7 +1393,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += (pow(self.vec[j] * fabs((x1[j]) - (x2[j])), self.p)) + d += (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)) return d cdef inline DTYPE_t dist( @@ -1407,10 +1405,10 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow( rdist, 1. / self.p) + return pow(rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow( dist, self.p) + return pow(dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. 
/ self.p) @@ -1449,26 +1447,26 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d = d + pow(self.vec[ix1] * fabs( - (x1_data[i1]) - (x2_data[i2]) + x1_data[i1] - x2_data[i2] ), self.p) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 else: - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 return d @@ -1549,7 +1547,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # compute (x1 - x2).T * VI * (x1 - x2) for i in range(size): - self.vec[i] = (x1[i]) - (x2[i]) + self.vec[i] = x1[i] - x2[i] for i in range(size): tmp = 0 @@ -1608,7 +1606,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - self.vec[ix1] = (x1_data[i1]) - (x2_data[i2]) + self.vec[ix1] = x1_data[i1] - x2_data[i2] i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: @@ -1763,9 +1761,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t denom, d = 0 cdef cnp.intp_t j for j in range(size): - denom = fabs((x1[j])) + fabs((x2[j])) + denom = fabs(x1[j]) + fabs(x2[j]) if denom > 0: - d += fabs((x1[j]) - (x2[j])) / denom + d += fabs(x1[j] - x2[j]) / denom return d cdef inline DTYPE_t dist_csr( @@ -1799,8 +1797,8 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d += ( - fabs((x1_data[i1]) - (x2_data[i2])) / - (fabs((x1_data[i1])) + fabs((x2_data[i2]))) + fabs(x1_data[i1] - x2_data[i2]) / + (fabs(x1_data[i1]) + fabs(x2_data[i2])) ) i1 = i1 + 1 i2 = i2 + 1 @@ -1843,8 +1841,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t num = 0, denom = 0 cdef cnp.intp_t j for j in range(size): - num += fabs((x1[j]) - (x2[j])) - denom += (fabs(x1[j]) + fabs(x2[j])) + num += fabs(x1[j] - x2[j]) + denom += fabs(x1[j]) + fabs(x2[j]) if denom > 0: return num / denom else: @@ -1881,28 +1879,28 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - num += fabs((x1_data[i1]) - (x2_data[i2])) - denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + num += fabs(x1_data[i1] - x2_data[i2]) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i1 = i1 + 1 else: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i2 = i2 + 1 else: while i1 < x1_end: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i1 = i1 + 1 return num / denom @@ -2601,9 +2599,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const 
{{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - cdef DTYPE_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) - cdef DTYPE_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) - return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + cdef DTYPE_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) + cdef DTYPE_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) cdef inline DTYPE_t dist(self, const {{INPUT_DTYPE_t}}* x1, @@ -2616,7 +2614,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 2 * asin(sqrt(rdist)) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - cdef DTYPE_t tmp = sin(0.5 * dist) + cdef DTYPE_t tmp = sin(0.5 * dist) return tmp * tmp def rdist_to_dist(self, rdist): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index eb4cd384a966f..66c7802a33f24 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -209,11 +209,15 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - assert_allclose(D64, D32, rtol=1e-5) + + # The original rtol is 1e-4 in of presence of float32. + # We increase the tolerance to 1e-6 to be stricter. + rtol = 1e-6 + assert_allclose(D64, D32, rtol=rtol) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32, rtol=1e-5) + assert_allclose(D64, D32, rtol=rtol) @pytest.mark.parametrize("metric", BOOL_METRICS) From f863a517307c6d4ab695e8fbf0d14a01a84cc2f8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Jun 2022 13:29:11 +0200 Subject: [PATCH 23/68] Do use the default rtol --- sklearn/metrics/tests/test_dist_metrics.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 66c7802a33f24..3a96ffb6909ef 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -209,15 +209,11 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - - # The original rtol is 1e-4 in of presence of float32. - # We increase the tolerance to 1e-6 to be stricter. - rtol = 1e-6 - assert_allclose(D64, D32, rtol=rtol) + assert_allclose(D64, D32) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32, rtol=rtol) + assert_allclose(D64, D32) @pytest.mark.parametrize("metric", BOOL_METRICS) From 5ba0fbeea2a5c12e8f888bc20b37a0403d0ab40b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 22 Jun 2022 13:52:42 +0200 Subject: [PATCH 24/68] Set rtol explicitly in test_distance_metrics_dtype_consistency --- sklearn/metrics/tests/test_dist_metrics.py | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 3a96ffb6909ef..ebd5f702eac9b 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -198,10 +198,15 @@ def test_pdist(metric_param_grid, X): @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_distance_metrics_dtype_consistency(metric_param_grid): - # DistanceMetric must return similar distances for - # both 64bit and 32bit data. 
+ # DistanceMetric must return similar distances for both float32 and float64 + # input data. metric, param_grid = metric_param_grid keys = param_grid.keys() + + # Chose rtol to make sure that this test is robust to changes in the random + # seed in the module-level test data generation code. + rtol = 1e-5 + for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) dm64 = DistanceMetric.get_metric(metric, **kwargs) @@ -209,11 +214,21 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - assert_allclose(D64, D32) + + # Both results are np.float64 dtype because the accumulation accross + # features is done in float64. However the input data and the element + # wise arithmetic operations are done in float32 so we can expect a + # small discrepancy. + assert D64.dtype == D32.dtype == np.float64 + + # assert_allclose introspects the dtype of the input arrays to decide + # which rtol value to use by default but in this case we know that D32 + # is not computed with the same precision so we set rtol manually. + assert_allclose(D64, D32, rtol=rtol) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32) + assert_allclose(D64, D32, rtol=rtol) @pytest.mark.parametrize("metric", BOOL_METRICS) From 4f4583934c151051510ba7cead1256bb6a7dabe6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Jun 2022 11:32:58 +0200 Subject: [PATCH 25/68] Implement the sparse-dense and the dense-sparse case for c-contiguity Also do test for c-contiguity. --- sklearn/metrics/_dist_metrics.pyx.tp | 37 ++++++++++++++++++++-- sklearn/metrics/tests/test_dist_metrics.py | 16 +++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 2784a767cf0c9..a1ff483d6c702 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -679,6 +679,39 @@ cdef class DistanceMetric{{name_suffix}}: ) return Darr + def _pairwise_dense_sparse(self, X, Y): + # Same remark as in _pairwise_sparse_dense. We could + # have implemented this method using _pairwise_dense_sparse, + # but this would have come with an extra copy to ensure + # c-contiguity of the result. 
+ + Y_csr = Y.tocsr() + n_Y, size = Y_csr.shape + Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) + Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) + + n_X, _ = X.shape + X_data = X.reshape(-1) + X_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.arange( + start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE + ) + + Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') + self.cdist_csr( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=Y_data, + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=size, + D=Darr, + ) + return Darr + + def pairwise(self, X, Y=None): """Compute the pairwise distances between X and Y @@ -711,9 +744,7 @@ cdef class DistanceMetric{{name_suffix}}: return self._pairwise_sparse_sparse(X, Y) if X_is_sparse and not Y_is_sparse: return self._pairwise_sparse_dense(X, Y) - if not X_is_sparse and Y_is_sparse: - # Swapping argument and transposing the result - return self._pairwise_sparse_dense(Y, X).T + return self._pairwise_dense_sparse(X, Y) #------------------------------------------------------------ # Euclidean Distance diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index ebd5f702eac9b..e11be4dab3e20 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -108,15 +108,19 @@ def test_cdist(metric_param_grid, X, Y): # DistanceMetric.pairwise must be consistent for all # combinations of formats in {sparse, dense}. D_sklearn = dm.pairwise(X, Y) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) D_sklearn = dm.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) D_sklearn = dm.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) D_sklearn = dm.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) @@ -136,15 +140,19 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): X_bool_csr, Y_bool_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) D_sklearn = dm.pairwise(X_bool, Y_bool) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) @@ -185,12 +193,15 @@ def test_pdist(metric_param_grid, X): dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) D_sklearn_csr = dm.pairwise(X_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) D_sklearn_csr = dm.pairwise(X_csr, X_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) @@ -203,7 +214,7 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): metric, param_grid = metric_param_grid keys = param_grid.keys() - # Chose rtol to make sure that this test is robust to changes in the random + # Choose rtol to make sure that this test is robust to changes in 
the random # seed in the module-level test data generation code. rtol = 1e-5 @@ -318,12 +329,15 @@ def haversine_slow(x1, x2): assert_allclose(D_sklearn, D_reference) D_sklearn = haversine.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_reference) D_sklearn = haversine.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_reference) D_sklearn = haversine.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_reference) From 3e3e8881d4bc1c6d11a9e473a1efd8c90bb5d1cc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Jun 2022 11:50:43 +0200 Subject: [PATCH 26/68] Add validation on X and Y, accepting CSR as inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_dist_metrics.pyx.tp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index a1ff483d6c702..98a77a61530d8 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -721,19 +721,25 @@ cdef class DistanceMetric{{name_suffix}}: Parameters ---------- - X : array-like - Array of shape (Nx, D), representing Nx points in D dimensions. - Y : array-like (optional) - Array of shape (Ny, D), representing Ny points in D dimensions. + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. If not specified, then Y=X. Returns ------- - dist : ndarray - The shape (Nx, Ny) array of pairwise distances between points in - X and Y. + dist : ndarray of shape (n_samples_X, n_samples_Y) + The distance matrix of pairwise distances between points in X and Y. """ Y = X if Y is None else Y + X_is_Y = X is Y + X = check_array(X, accept_sparse=['csr']) + + if X_is_Y: + Y = X + else: + Y = check_array(Y, accept_sparse=['csr']) X_is_sparse = issparse(X) Y_is_sparse = issparse(Y) From ddc49d5a6373b882492cdd93da6d6d9ef15cdd63 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Jun 2022 12:50:39 +0200 Subject: [PATCH 27/68] Remove left-overs Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 98a77a61530d8..3dd720454fdee 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -642,11 +642,10 @@ cdef class DistanceMetric{{name_suffix}}: return Darr def _pairwise_sparse_dense(self, X, Y): - X_csr = X.tocsr() - n_X, size = X_csr.shape - X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) - X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) + n_X, size = X.shape + X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) # To avoid introducing redundant implementations for the CSR × dense array # case, we wrap the dense array into a fake CSR datastructure and leverage @@ -684,12 +683,10 @@ cdef class DistanceMetric{{name_suffix}}: # have implemented this method using _pairwise_dense_sparse, # but this would have come with an extra copy to ensure # c-contiguity of the result. 
- - Y_csr = Y.tocsr() - n_Y, size = Y_csr.shape - Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) - Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) - Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) + n_Y, size = Y.shape + Y_data = np.asarray(Y.data, dtype={{INPUT_DTYPE}}) + Y_indices = np.asarray(Y.indices, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.asarray(Y.indptr, dtype=SPARSE_INDEX_TYPE) n_X, _ = X.shape X_data = X.reshape(-1) @@ -729,14 +726,12 @@ cdef class DistanceMetric{{name_suffix}}: Returns ------- - dist : ndarray of shape (n_samples_X, n_samples_Y) + dist : ndarray of shape (n_samples_X, n_samples_Y) The distance matrix of pairwise distances between points in X and Y. """ - Y = X if Y is None else Y - X_is_Y = X is Y X = check_array(X, accept_sparse=['csr']) - if X_is_Y: + if Y is None: Y = X else: Y = check_array(Y, accept_sparse=['csr']) From dec0aa8355c87bf945830f783c5d2d2ceb4d51e0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sat, 2 Jul 2022 21:44:28 +0200 Subject: [PATCH 28/68] Add support for all combinations of {dense,sparse} datasets pairs --- .../_datasets_pair.pxd | 37 ++- .../_datasets_pair.pyx | 219 +++++++++++++++++- .../_dispatcher.py | 3 - .../test_pairwise_distances_reduction.py | 101 +++++++- 4 files changed, 336 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd index de6458f8c6f26..60fbbac3a6371 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd @@ -1,9 +1,11 @@ -from ...utils._typedefs cimport DTYPE_t, ITYPE_t +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t from ...metrics._dist_metrics cimport DistanceMetric cdef class DatasetsPair: - cdef DistanceMetric distance_metric + cdef: + DistanceMetric distance_metric + ITYPE_t n_features cdef ITYPE_t n_samples_X(self) nogil @@ -18,4 +20,33 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): cdef: const DTYPE_t[:, ::1] X const DTYPE_t[:, ::1] Y - ITYPE_t d + + +cdef class SparseSparseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + const DTYPE_t[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr + + +cdef class SparseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + const DTYPE_t[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + ITYPE_t n_Y + + +cdef class DenseSparseDatasetsPair(DatasetsPair): + cdef: + # As distance metrics are commutative, we can simply rely + # on the implementation of SparseDenseDatasetsPair and + # swap arguments. 
+ DatasetsPair datasets_pair diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx index abef1bed098ed..12ec4a99ea7ac 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx @@ -2,11 +2,14 @@ import numpy as np cimport numpy as cnp from cython cimport final -from scipy.sparse import issparse +from scipy.sparse import issparse, csr_matrix from ...utils._typedefs cimport DTYPE_t, ITYPE_t from ...metrics._dist_metrics cimport DistanceMetric +from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE + + cnp.import_array() cdef class DatasetsPair: @@ -91,14 +94,31 @@ cdef class DatasetsPair: distance_metric._validate_data(X) distance_metric._validate_data(Y) - # TODO: dispatch to other dataset pairs for sparse support once available: - if issparse(X) or issparse(Y): - raise ValueError("Only dense datasets are supported for X and Y.") + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseDatasetsPair(X, Y, distance_metric) - return DenseDenseDatasetsPair(X, Y, distance_metric) + if X_is_sparse and Y_is_sparse: + return SparseSparseDatasetsPair(X, Y, distance_metric) - def __init__(self, DistanceMetric distance_metric): + if X_is_sparse and not Y_is_sparse: + return SparseDenseDatasetsPair(X, Y, distance_metric) + + return DenseSparseDatasetsPair(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__(self, DistanceMetric distance_metric, ITYPE_t n_features): self.distance_metric = distance_metric + self.n_features = n_features cdef ITYPE_t n_samples_X(self) nogil: """Number of samples in X.""" @@ -140,12 +160,16 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): between two row vectors of (X, Y). """ - def __init__(self, X, Y, DistanceMetric distance_metric): - super().__init__(distance_metric) + def __init__( + self, + DTYPE_t[:, ::1] X, + DTYPE_t[:, ::1] Y, + DistanceMetric distance_metric, + ): + super().__init__(distance_metric, n_features=X.shape[1]) # Arrays have already been checked self.X = X self.Y = Y - self.d = X.shape[1] @final cdef ITYPE_t n_samples_X(self) nogil: @@ -157,8 +181,181 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): @final cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d) + return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.n_features) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.n_features) + + +@final +cdef class SparseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two CSR matrices. + + Parameters + ---------- + X: sparse matrix of shape (n_samples_X, n_features) + Rows represent vectors. Must be in CSR format. + + Y: sparse matrix of shape (n_samples_Y, n_features) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + @final + cdef ITYPE_t n_samples_X(self) nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef ITYPE_t n_samples_Y(self) nogil: + return self.Y_indptr.shape[0] -1 + + @final + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist_csr( + x1_data=self.X_data, + x1_indices=self.X_indices, + x2_data=self.Y_data, + x2_indices=self.Y_indices, + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=self.Y_indptr[j], + x2_end=self.Y_indptr[j + 1], + size=self.n_features, + ) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist_csr( + x1_data=self.X_data, + x1_indices=self.X_indices, + x2_data=self.Y_data, + x2_indices=self.Y_indices, + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=self.Y_indptr[j], + x2_end=self.Y_indptr[j + 1], + size=self.n_features, + ) + + +@final +cdef class SparseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a CSR matrix and a dense array. + + Parameters + ---------- + X: sparse matrix of shape (n_samples_X, n_features) + Rows represent vectors. Must be in CSR format. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + # Y array already has been checked here + self.n_Y = Y.shape[0] + self.Y_data = np.ravel(Y) + + # Since Y vectors are dense, we can use a single arrays + # of indices of self.n_features elements instead of + # a self.n_Y × self.n_features matrices. + # The implementations of DistanceMetric.{dist_csr,rdist_csr} + # support this representation. + self.Y_indices = np.arange(self.n_features, dtype=SPARSE_INDEX_TYPE) + + @final + cdef ITYPE_t n_samples_X(self) nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef ITYPE_t n_samples_Y(self) nogil: + return self.n_Y + + @final + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist_csr( + x1_data=self.X_data, + x1_indices=self.X_indices, + x2_data=self.Y_data, + x2_indices=self.Y_indices, + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=j * self.n_features, + x2_end=(j + 1) * self.n_features, + size=self.n_features, + ) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist_csr( + x1_data=self.X_data, + x1_indices=self.X_indices, + x2_data=self.Y_data, + x2_indices=self.Y_indices, + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=j * self.n_features, + x2_end=(j + 1) * self.n_features, + size=self.n_features, + ) + + +@final +cdef class DenseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a dense array and a CSR matrix. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: sparse matrix of shape (n_samples_Y, n_features) + Rows represent vectors. Must be in CSR format. 
+ + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + # Swapping arguments on the constructor + self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) + + @final + cdef ITYPE_t n_samples_X(self) nogil: + # Swapping interface + return self.datasets_pair.n_samples_Y() + + @final + cdef ITYPE_t n_samples_Y(self) nogil: + # Swapping interface + return self.datasets_pair.n_samples_X() + + @final + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + # Swapping arguments on the same interface + return self.datasets_pair.surrogate_dist(j, i) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.d) + # Swapping arguments on the same interface + return self.datasets_pair.dist(j, i) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index a79fde694a9ed..932b671ad3e54 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -3,7 +3,6 @@ import numpy as np from typing import List -from scipy.sparse import issparse from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING from ._base import _sqeuclidean_row_norms64 @@ -82,8 +81,6 @@ def is_usable_for(cls, X, Y, metric) -> bool: dtypes_validity = X.dtype == Y.dtype == np.float64 return ( get_config().get("enable_cython_pairwise_dist", True) - and not issparse(X) - and not issparse(Y) and dtypes_validity and metric in cls.valid_metrics() ) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 0b9c6e6aad196..28d55b76c9905 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -513,11 +513,16 @@ def test_pairwise_distances_reduction_is_usable_for(): rng = np.random.RandomState(0) X = rng.rand(100, 10) Y = rng.rand(100, 10) + X_csr = csr_matrix(X) + Y_csr = csr_matrix(Y) metric = "euclidean" - assert PairwiseDistancesReduction.is_usable_for( - X.astype(np.float64), X.astype(np.float64), metric - ) + # Must be usable for all possible pair of {dense, sparse} datasets + assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + assert PairwiseDistancesReduction.is_usable_for(X_csr, Y_csr, metric) + assert PairwiseDistancesReduction.is_usable_for(X_csr, Y, metric) + assert PairwiseDistancesReduction.is_usable_for(X, Y_csr, metric) + assert not PairwiseDistancesReduction.is_usable_for( X.astype(np.int64), Y.astype(np.int64), metric ) @@ -526,10 +531,6 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) assert not PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric) - # TODO: remove once sparse matrices are supported - assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) - assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) - def test_argkmin_factory_method_wrong_usages(): rng = np.random.RandomState(1) @@ -750,6 +751,92 @@ def test_n_threads_agnosticism( ) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) 
+@pytest.mark.parametrize(
+    "PairwiseDistancesReduction",
+    [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
+)
+def test_format_agnosticism(
+    global_random_seed,
+    PairwiseDistancesReduction,
+    n_samples,
+    chunk_size,
+    n_features=100,
+    dtype=np.float64,
+):
+    # Results must not depend on the format (dense or CSR) of the datasets
+    rng = np.random.RandomState(global_random_seed)
+    spread = 100
+    X = rng.rand(n_samples, n_features).astype(dtype) * spread
+    Y = rng.rand(n_samples, n_features).astype(dtype) * spread
+
+    X_csr = csr_matrix(X)
+    Y_csr = csr_matrix(Y)
+
+    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
+        parameter = 10
+        check_parameters = {}
+    else:
+        # Scaling the radius slightly with the number of dimensions
+        radius = 10 ** np.log(n_features)
+        parameter = radius
+        check_parameters = {"radius": radius}
+
+    # XXX: use itertools.pairwise when available?
+    dist_dense_dense, indices_dense_dense = PairwiseDistancesReduction.compute(
+        X,
+        Y,
+        parameter,
+        return_distance=True,
+    )
+
+    dist_sparse_sparse, indices_sparse_sparse = PairwiseDistancesReduction.compute(
+        X_csr,
+        Y_csr,
+        parameter,
+        return_distance=True,
+    )
+
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
+        dist_dense_dense,
+        dist_sparse_sparse,
+        indices_dense_dense,
+        indices_sparse_sparse,
+        **check_parameters,
+    )
+
+    dist_dense_sparse, indices_dense_sparse = PairwiseDistancesReduction.compute(
+        X,
+        Y_csr,
+        parameter,
+        return_distance=True,
+    )
+
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
+        dist_dense_dense,
+        dist_dense_sparse,
+        indices_dense_dense,
+        indices_dense_sparse,
+        **check_parameters,
+    )
+
+    dist_sparse_dense, indices_sparse_dense = PairwiseDistancesReduction.compute(
+        X_csr,
+        Y,
+        parameter,
+        return_distance=True,
+    )
+
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
+        dist_dense_dense,
+        dist_sparse_dense,
+        indices_dense_dense,
+        indices_sparse_dense,
+        **check_parameters,
+    )
+
+
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("n_samples", [100, 1000])

From 63c6fe36e72a4e1aa9490201e856eb8361633b06 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 5 Jul 2022 14:42:34 +0200
Subject: [PATCH 29/68] Const-qualify X and Y

---
 .../metrics/_pairwise_distances_reduction/_datasets_pair.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
index 12ec4a99ea7ac..70410d86016d8 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
@@ -162,8 +162,8 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):

     def __init__(
         self,
-        DTYPE_t[:, ::1] X,
-        DTYPE_t[:, ::1] Y,
+        const DTYPE_t[:, ::1] X,
+        const DTYPE_t[:, ::1] Y,
         DistanceMetric distance_metric,
     ):
         super().__init__(distance_metric, n_features=X.shape[1])

From 30e84af5f489b4ffd30c6dd84d4d86061cf65f86 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Tue, 5 Jul 2022 16:14:05 +0200
Subject: [PATCH 30/68] Only pass Y_norm_squared for the dense-dense case

---
 sklearn/cluster/_birch.py                              | 8 +++++++-
 .../metrics/_pairwise_distances_reduction/_argkmin.pyx | 4 ++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py
index 20a414d1ac56e..ceace7bf739d8 100644
--- a/sklearn/cluster/_birch.py
+++ b/sklearn/cluster/_birch.py @@ -694,7 +694,13 @@ def predict(self, X): def _predict(self, X): """Predict data using the ``centroids_`` of subclusters.""" - kwargs = {"Y_norm_squared": self._subcluster_norms} + + # The extra `Y_norm_squared` argument for the back-end + # is only supported for the dense-dense case. + if not sparse.issparse(X) and not sparse.issparse(self.subcluster_centers_): + kwargs = {"Y_norm_squared": self._subcluster_norms} + else: + kwargs = {} with config_context(assume_finite=True): argmin = pairwise_distances_argmin( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx index 2f378543e1f97..acadd2e2f2867 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -79,8 +79,8 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): metric_kwargs=metric_kwargs, ) else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. pda = PairwiseDistancesArgKmin64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), k=k, From 780d7bbf5a79c0b2285fc2cad015bb0efe330c02 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 6 Jul 2022 11:32:56 +0200 Subject: [PATCH 31/68] Update comments --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx | 4 ++-- .../_pairwise_distances_reduction/_radius_neighborhood.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx index acadd2e2f2867..afad58ae67297 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -79,8 +79,8 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): metric_kwargs=metric_kwargs, ) else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. + # Fall back on a generic implementation that handles all distance + # metrics by computing it between 2 vectors at a time. pda = PairwiseDistancesArgKmin64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), k=k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx index db2c22e89d06d..0b88e70c28534 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx @@ -103,8 +103,8 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): sort_results=sort_results, ) else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. + # Fall back on a generic implementation that handles all distance + # metrics by computing it between 2 vectors at a time. 
pda = PairwiseDistancesRadiusNeighborhood64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), radius=radius, From 72f4ae7bb7e00e9a88c30269455c00aa9bb4f546 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 19 Jul 2022 09:50:08 +0200 Subject: [PATCH 32/68] Pop unused keywords arguments --- sklearn/cluster/_birch.py | 8 +------- .../metrics/_pairwise_distances_reduction/_argkmin.pyx | 6 ++++++ .../_radius_neighborhood.pyx | 8 +++++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index ceace7bf739d8..20a414d1ac56e 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -694,13 +694,7 @@ def predict(self, X): def _predict(self, X): """Predict data using the ``centroids_`` of subclusters.""" - - # The extra `Y_norm_squared` argument for the back-end - # is only supported for the dense-dense case. - if not sparse.issparse(X) and not sparse.issparse(self.subcluster_centers_): - kwargs = {"Y_norm_squared": self._subcluster_norms} - else: - kwargs = {} + kwargs = {"Y_norm_squared": self._subcluster_norms} with config_context(assume_finite=True): argmin = pairwise_distances_argmin( diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx index afad58ae67297..734e8ada2c55e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -4,6 +4,7 @@ from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX from cython cimport final from cython.parallel cimport parallel, prange +from scipy import sparse from ._base cimport ( PairwiseDistancesReduction64, @@ -81,6 +82,11 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): else: # Fall back on a generic implementation that handles all distance # metrics by computing it between 2 vectors at a time. + + # The extra `Y_norm_squared` argument for the back-end is only + # supported for the FastEuclidean variant. + metric_kwargs.pop("Y_norm_squared", None) + pda = PairwiseDistancesArgKmin64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), k=k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx index 0b88e70c28534..5f03bc7bbf246 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx @@ -105,11 +105,15 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): else: # Fall back on a generic implementation that handles all distance # metrics by computing it between 2 vectors at a time. + + # The extra `Y_norm_squared` argument for the back-end is only + # supported for the FastEuclidean variant. 
+ metric_kwargs.pop("Y_norm_squared", None) + pda = PairwiseDistancesRadiusNeighborhood64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size, - metric_kwargs=metric_kwargs, strategy=strategy, sort_results=sort_results, ) @@ -132,7 +136,6 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): chunk_size=None, strategy=None, sort_results=False, - metric_kwargs=None, ): super().__init__( datasets_pair=datasets_pair, @@ -355,7 +358,6 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesR chunk_size=chunk_size, strategy=strategy, sort_results=sort_results, - metric_kwargs=metric_kwargs, ) # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair cdef: From a1ce04215653155b5e5f707b0a271acae85848f1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 19 Jul 2022 10:11:04 +0200 Subject: [PATCH 33/68] Remove unused import --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx index 734e8ada2c55e..c5f8609eb45dd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -4,7 +4,6 @@ from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX from cython cimport final from cython.parallel cimport parallel, prange -from scipy import sparse from ._base cimport ( PairwiseDistancesReduction64, From 713b932d542650c8d15fb5a0118fb7b7a546c382 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 19 Jul 2022 10:18:57 +0200 Subject: [PATCH 34/68] DOC Add whats_new entry --- doc/whats_new/v1.2.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 832a4e4389b19..2b7a4c2b6dbd7 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -44,6 +44,32 @@ Changes impacting all modules NumPy's SIMD optimized primitives. :pr:`23446` by :user:`Meekail Zain ` +- |Enhancement| Support for combinations of dense and sparse datasets pairs + for all distance metrics has been added on the following estimators: + + - :func:`sklearn.metrics.pairwise_distances_argmin` + - :func:`sklearn.metrics.pairwise_distances_argmin_min` + - :class:`sklearn.cluster.AffinityPropagation` + - :class:`sklearn.cluster.Birch` + - :class:`sklearn.cluster.MeanShift` + - :class:`sklearn.cluster.OPTICS` + - :class:`sklearn.cluster.SpectralClustering` + - :func:`sklearn.feature_selection.mutual_info_regression` + - :class:`sklearn.neighbors.KNeighborsClassifier` + - :class:`sklearn.neighbors.KNeighborsRegressor` + - :class:`sklearn.neighbors.RadiusNeighborsClassifier` + - :class:`sklearn.neighbors.RadiusNeighborsRegressor` + - :class:`sklearn.neighbors.LocalOutlierFactor` + - :class:`sklearn.neighbors.NearestNeighbors` + - :class:`sklearn.manifold.Isomap` + - :class:`sklearn.manifold.LocallyLinearEmbedding` + - :class:`sklearn.manifold.TSNE` + - :func:`sklearn.manifold.trustworthiness` + - :class:`sklearn.semi_supervised.LabelPropagation` + - :class:`sklearn.semi_supervised.LabelSpreading` + + :pr:`23585` by `Julien Jerphanion ` + Changelog --------- From 5570f71db04fa691f360c561d2f3c8959a613025 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 19 Jul 2022 11:01:11 +0200 Subject: [PATCH 35/68] fixup! 
Pop unused keywords arguments --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx | 3 ++- .../_pairwise_distances_reduction/_radius_neighborhood.pyx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx index c5f8609eb45dd..0c3fe509bd459 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -84,7 +84,8 @@ cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): # The extra `Y_norm_squared` argument for the back-end is only # supported for the FastEuclidean variant. - metric_kwargs.pop("Y_norm_squared", None) + if metric_kwargs is not None: + metric_kwargs.pop("Y_norm_squared", None) pda = PairwiseDistancesArgKmin64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx index 5f03bc7bbf246..3f3e22869ae04 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx @@ -108,7 +108,8 @@ cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): # The extra `Y_norm_squared` argument for the back-end is only # supported for the FastEuclidean variant. - metric_kwargs.pop("Y_norm_squared", None) + if metric_kwargs is not None: + metric_kwargs.pop("Y_norm_squared", None) pda = PairwiseDistancesRadiusNeighborhood64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), From 4c455ea440839eedd6f4b681d0b6eaa636aee4f4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 20 Jul 2022 15:50:41 +0200 Subject: [PATCH 36/68] DOC Update comment and changelog Co-authored-by: Olivier Grisel --- doc/whats_new/v1.2.rst | 6 ------ .../_pairwise_distances_reduction/_datasets_pair.pyx | 7 ++++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index f9cc448595477..7052bdbed3426 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -56,10 +56,7 @@ Changes impacting all modules - :func:`sklearn.metrics.pairwise_distances_argmin_min` - :class:`sklearn.cluster.AffinityPropagation` - :class:`sklearn.cluster.Birch` - - :class:`sklearn.cluster.MeanShift` - - :class:`sklearn.cluster.OPTICS` - :class:`sklearn.cluster.SpectralClustering` - - :func:`sklearn.feature_selection.mutual_info_regression` - :class:`sklearn.neighbors.KNeighborsClassifier` - :class:`sklearn.neighbors.KNeighborsRegressor` - :class:`sklearn.neighbors.RadiusNeighborsClassifier` @@ -67,11 +64,8 @@ Changes impacting all modules - :class:`sklearn.neighbors.LocalOutlierFactor` - :class:`sklearn.neighbors.NearestNeighbors` - :class:`sklearn.manifold.Isomap` - - :class:`sklearn.manifold.LocallyLinearEmbedding` - :class:`sklearn.manifold.TSNE` - :func:`sklearn.manifold.trustworthiness` - - :class:`sklearn.semi_supervised.LabelPropagation` - - :class:`sklearn.semi_supervised.LabelSpreading` :pr:`23585` by `Julien Jerphanion ` diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx index 70410d86016d8..152b4c56afec1 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx 
@@ -110,7 +110,8 @@ cdef class DatasetsPair:

     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure getting ITYPE instead of int internally used for CSR matrices."""
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        # TODO: leave X.data unchanged once float32 is supported.
         X_data = np.asarray(X.data, dtype=DTYPE)
         X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
         X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
@@ -274,9 +275,9 @@ cdef class SparseDenseDatasetsPair(DatasetsPair):
         self.n_Y = Y.shape[0]
         self.Y_data = np.ravel(Y)

-        # Since Y vectors are dense, we can use a single arrays
+        # Since Y vectors are dense, we can use a single array
         # of indices of self.n_features elements instead of
-        # a self.n_Y × self.n_features matrices.
+        # a self.n_Y × self.n_features matrix.
         # The implementations of DistanceMetric.{dist_csr,rdist_csr}
         # support this representation.
         self.Y_indices = np.arange(self.n_features, dtype=SPARSE_INDEX_TYPE)

From 0f0ea70ec3e0597de5ecf7f65c352777a202212f Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 21 Jul 2022 11:35:22 +0200
Subject: [PATCH 37/68] MAINT Test second alternative for sparse-dense support

The modulo trick is costly because it uses integer division under the
hood, which is up to 80 times slower than other integer operations. An
alternative is to pass the address of the `indices` array and to shift
this address at the caller's level, using the information available
there, so that the dereferencing works properly.

This commit replaces the modulo trick with this alternative and adapts
the documentation and comments accordingly.

See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1190672626
---
 sklearn/metrics/_dist_metrics.pxd.tp |  14 +-
 sklearn/metrics/_dist_metrics.pyx.tp | 450 ++++++++----------
 .../_datasets_pair.pyx               |  64 ++-
 3 files changed, 240 insertions(+), 288 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index 8f8aa21107015..fac4202c419ca 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -102,9 +102,9 @@ cdef class DistanceMetric{{name_suffix}}:
     cdef DTYPE_t dist_csr(
         self,
         const {{INPUT_DTYPE_t}}[:] x1_data,
-        const SPARSE_INDEX_TYPE_t[:] x1_indices,
+        const SPARSE_INDEX_TYPE_t* x1_indices,
         const {{INPUT_DTYPE_t}}[:] x2_data,
-        const SPARSE_INDEX_TYPE_t[:] x2_indices,
+        const SPARSE_INDEX_TYPE_t* x2_indices,
         const SPARSE_INDEX_TYPE_t x1_start,
         const SPARSE_INDEX_TYPE_t x1_end,
         const SPARSE_INDEX_TYPE_t x2_start,
@@ -115,9 +115,9 @@
     cdef DTYPE_t rdist_csr(
         self,
         const {{INPUT_DTYPE_t}}[:] x1_data,
-        const SPARSE_INDEX_TYPE_t[:] x1_indices,
+        const SPARSE_INDEX_TYPE_t* x1_indices,
         const {{INPUT_DTYPE_t}}[:] x2_data,
-        const SPARSE_INDEX_TYPE_t[:] x2_indices,
+        const SPARSE_INDEX_TYPE_t* x2_indices,
         const SPARSE_INDEX_TYPE_t x1_start,
         const SPARSE_INDEX_TYPE_t x1_end,
         const SPARSE_INDEX_TYPE_t x2_start,
@@ -141,7 +141,7 @@
     cdef int pdist_csr(
         self,
         const {{INPUT_DTYPE_t}}[:] x1_data,
-        const SPARSE_INDEX_TYPE_t[:] x1_indices,
+        const SPARSE_INDEX_TYPE_t* x1_indices,
         const SPARSE_INDEX_TYPE_t[:] x1_indptr,
         const ITYPE_t size,
         DTYPE_t[:, ::1] D,
@@ -150,10 +150,10 @@
     cdef int cdist_csr(
         self,
         const {{INPUT_DTYPE_t}}[:] x1_data,
-        const SPARSE_INDEX_TYPE_t[:] x1_indices,
+        const SPARSE_INDEX_TYPE_t* x1_indices,
         const SPARSE_INDEX_TYPE_t[:] x1_indptr,
         const {{INPUT_DTYPE_t}}[:]
x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index a7574bff86510..19343aaa4274e 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -396,9 +396,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -412,17 +412,21 @@ cdef class DistanceMetric{{name_suffix}}: Notes ----- - The implementation of this method in subclasses must be robust to the + 0. The implementation of this method in subclasses must be robust to the presence of explicit zeros in the CSR representation. - An alternative signature would be: + 1. The `indices` are passed using pointers to be able to support an + alternative representation of the CSR data structure for supporting + fused sparse-dense datasets pairs. See the + + 2. An alternative signature would be: cdef DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, ) nogil except -1: Where calles would use slicing on the original CSR data and indices @@ -435,19 +439,19 @@ cdef class DistanceMetric{{name_suffix}}: self.dist_csr( x1_data[x1_start:x1_end], - x1_indices[x1_start:x1_end], + &x1_indices[x1_start], x2_data[x2_start:x2_end], - x2_indices[x2_start:x2_end], + &x2_indices[x2_start], ) Yet, slicing on memoryview slows down execution as it takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 Hence, to avoid slicing the data and indices arrays of the sparse - matrices containing respectively x1 and x2 (namely x{1,2}_{data,indice}) - are passed as well as their indice pointers (namely x{1,2}_{start,end}). + matrices containing respectively x1 and x2 (namely x{1,2}_{data,indices}) + are passed as well as their indices pointers (namely x{1,2}_{start,end}). - For reference about the CSR format, see section 3.4 of + 3. For reference about the CSR format, see section 3.4 of Saad, Y. (2003), Iterative Methods for Sparse Linear Systems, SIAM. 
https://www-users.cse.umn.edu/~saad/IterMethBook_2ndEd.pdf """ @@ -456,9 +460,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -497,7 +501,7 @@ cdef class DistanceMetric{{name_suffix}}: cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, @@ -534,10 +538,10 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, @@ -635,18 +639,30 @@ cdef class DistanceMetric{{name_suffix}}: self.cdist(Xarr, Yarr, Darr) return Darr - def _pairwise_sparse_sparse(self, X, Y): + def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): + cdef: + ITYPE_t n_X, size + const {{INPUT_DTYPE_t}}[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + ITYPE_t n_Y + const {{INPUT_DTYPE_t}}[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr + + DTYPE_t[:, ::1] Darr + X_csr = X.tocsr() n_X, size = X_csr.shape X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) - if X is Y: Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') self.pdist_csr( x1_data=X_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x1_indptr=X_indptr, size=size, D=Darr, @@ -661,77 +677,81 @@ cdef class DistanceMetric{{name_suffix}}: Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( x1_data=X_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x1_indptr=X_indptr, x2_data=Y_data, - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x2_indptr=Y_indptr, size=size, D=Darr, ) return Darr - def _pairwise_sparse_dense(self, X, Y): - n_X, size = X.shape - X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) - X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - - # To avoid introducing redundant implementations for the CSR × dense array - # case, we wrap the dense array into a fake CSR datastructure and leverage - # the existing code for the CSR × CSR case. - # The true CSR representation of a dense array would require allocating - # a Y_indices matrix of shape (n_samples, n_features) with repeated - # contiguous integers from 0 to n_features - 1 on each row which would - # be very wasteful from a memory point of view. Instead we only allocate - # a single row and adapt the CSR × CSR routines to use a modulo operation - # when accessing Y_indices in order to achieve the same result without having - # to materialize the indices repetition explicitly. 
- - n_Y, _ = Y.shape - Y_data = Y.reshape(-1) - Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) - Y_indptr = np.arange( - start=0, stop=size * (n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE - ) + def _pairwise_sparse_dense(self, + X: csr_matrix, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + ): + cdef: + ITYPE_t n_X = X.shape[0] + ITYPE_t size = X.shape[1] + const {{INPUT_DTYPE_t}}[:] X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + const SPARSE_INDEX_TYPE_t[:] X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + const SPARSE_INDEX_TYPE_t[:] X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + + # To understand the CSR format used here, see the comment in + # `SparseDenseDatasetsPair.__init__`. + ITYPE_t n_Y = Y.shape[0] + const {{INPUT_DTYPE_t}}[:] Y_data = Y.reshape(-1) + const SPARSE_INDEX_TYPE_t[:] Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + const SPARSE_INDEX_TYPE_t[:] Y_indptr = np.arange( + start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE + ) + + DTYPE_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') - Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( x1_data=X_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x1_indptr=X_indptr, x2_data=Y_data, - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x2_indptr=Y_indptr, size=size, D=Darr, ) return Darr - def _pairwise_dense_sparse(self, X, Y): - # Same remark as in _pairwise_sparse_dense. We could - # have implemented this method using _pairwise_dense_sparse, - # but this would have come with an extra copy to ensure - # c-contiguity of the result. - n_Y, size = Y.shape - Y_data = np.asarray(Y.data, dtype={{INPUT_DTYPE}}) - Y_indices = np.asarray(Y.indices, dtype=SPARSE_INDEX_TYPE) - Y_indptr = np.asarray(Y.indptr, dtype=SPARSE_INDEX_TYPE) - - n_X, _ = X.shape - X_data = X.reshape(-1) - X_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.arange( - start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE - ) + def _pairwise_dense_sparse(self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + Y: csr_matrix, + ): + # We could have implemented this method using _pairwise_dense_sparse by + # swapping argument and by transposing the results, but this would + # have come with an extra copy to ensure C-contiguity of the result. 
+ cdef: + ITYPE_t n_X = X.shape[0] + ITYPE_t size = X.shape[1] + + # To understand the CSR format used here, see the comment in + # `SparseDenseDatasetsPair.__init__` + const {{INPUT_DTYPE_t}}[:] X_data = X.reshape(-1) + const SPARSE_INDEX_TYPE_t[:] X_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + const SPARSE_INDEX_TYPE_t[:] X_indptr = np.arange( + start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE + ) + ITYPE_t n_Y = Y.shape[0] + const {{INPUT_DTYPE_t}}[:] Y_data = np.asarray(Y.data, dtype={{INPUT_DTYPE}}) + const SPARSE_INDEX_TYPE_t[:] Y_indices = np.asarray(Y.indices, dtype=SPARSE_INDEX_TYPE) + const SPARSE_INDEX_TYPE_t[:] Y_indptr = np.asarray(Y.indptr, dtype=SPARSE_INDEX_TYPE) + + DTYPE_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') - Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( x1_data=X_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x1_indptr=X_indptr, x2_data=Y_data, - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x2_indptr=Y_indptr, size=size, D=Darr, @@ -818,9 +838,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -832,18 +852,13 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. 
- ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: unsquared = x1_data[i1] - x2_data[i2] @@ -875,9 +890,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -951,9 +966,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -965,18 +980,13 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: unsquared = x1_data[i1] - x2_data[i2] @@ -994,13 +1004,13 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2 % len_x2_indices] + ix2 = x2_indices[i2] unsquared = x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix2] i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1 % len_x1_indices] + ix1 = x1_indices[i1] unsquared = x1_data[i1] d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 @@ -1009,9 +1019,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1058,9 +1068,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1072,17 +1082,12 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 
< x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d = d + fabs(x1_data[i1] - x2_data[i2]) @@ -1147,9 +1152,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1161,17 +1166,12 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) @@ -1289,9 +1289,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1303,19 +1303,14 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 bint has_w = self.size > 0 if has_w: while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d = d + (self.vec[ix1] * pow(fabs( @@ -1332,23 +1327,20 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2 % len_x2_indices] + ix2 = x2_indices[i2] d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1 % len_x1_indices] + ix1 = x1_indices[i1] d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d else: while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. 
- ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d = d + (pow(fabs( @@ -1377,9 +1369,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1481,9 +1473,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1495,17 +1487,12 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d = d + pow(self.vec[ix1] * fabs( @@ -1522,12 +1509,12 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2 % len_x2_indices] + ix2 = x2_indices[i2] d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1 % len_x1_indices] + ix1 = x1_indices[i1] d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 @@ -1536,9 +1523,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1641,9 +1628,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1655,17 +1642,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t tmp, d = 0.0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for 
CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: self.vec[ix1] = x1_data[i1] - x2_data[i2] @@ -1680,12 +1662,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2 % len_x2_indices] + ix2 = x2_indices[i2] self.vec[ix2] = - x2_data[i2] i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1 % len_x1_indices] + ix1 = x1_indices[i1] self.vec[ix1] = x1_data[i1] i1 = i1 + 1 @@ -1700,9 +1682,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1751,9 +1733,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1765,17 +1747,12 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d += (x1_data[i1] != x2_data[i2]) @@ -1831,9 +1808,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1845,17 +1822,12 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. 
- ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: d += ( @@ -1913,9 +1885,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1927,18 +1899,13 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t num = 0.0 DTYPE_t denom = 0.0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: num += fabs(x1_data[i1] - x2_data[i2]) @@ -2002,9 +1969,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2016,17 +1983,12 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2090,9 +2052,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2104,17 +2066,12 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. 
- ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] if ix1 == ix2: tf1 = x1_data[i1] != 0 @@ -2171,9 +2128,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2185,17 +2142,12 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2257,9 +2209,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2271,17 +2223,12 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2340,9 +2287,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2354,17 +2301,12 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. 
- ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2422,9 +2364,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2436,17 +2378,12 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2497,9 +2434,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2511,17 +2448,12 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2580,9 +2512,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2594,17 +2526,12 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. 
- ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2689,9 +2616,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2713,9 +2640,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}[:] x1_data, - const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t* x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, - const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2727,8 +2654,6 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start - cnp.npy_intp len_x1_indices = x1_indices.shape[0] - cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t x1_0 = 0 DTYPE_t x1_1 = 0 @@ -2738,11 +2663,8 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t sin_1 while i1 < x1_end and i2 < x2_end: - # Use the modulo-trick to implement support for CSR × dense array - # with the CSR × CSR routine. See _pairwise_sparse_dense for more - # details. - ix1 = x1_indices[i1 % len_x1_indices] - ix2 = x2_indices[i2 % len_x2_indices] + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] # Find the components in the 2D vectors to work with x1_component = ix1 if (x1_start == 0) else ix1 % x1_start @@ -2763,7 +2685,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2 % len_x2_indices] + ix2 = x2_indices[i2] x2_component = ix2 if (x2_start == 0) else ix2 % x2_start if x2_component == 0: x2_0 = x2_data[i2] @@ -2772,7 +2694,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1 % len_x1_indices] + ix1 = x1_indices[i1] x1_component = ix1 if (x1_start == 0) else ix1 % x1_start if x1_component == 0: x1_0 = x1_data[i1] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx index 70410d86016d8..c8cb7eebba3e7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx @@ -223,9 +223,9 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.rdist_csr( x1_data=self.X_data, - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=self.Y_data, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -237,9 +237,9 @@ cdef class SparseSparseDatasetsPair(DatasetsPair): cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.dist_csr( x1_data=self.X_data, - x1_indices=self.X_indices, + 
x1_indices=&self.X_indices[0], x2_data=self.Y_data, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -270,15 +270,31 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + # We support the sparse-dense case by using the sparse-sparse case to avoid + # introducing a new complex set of interfaces. + # + # To do so, an alternative CSR datastructure is used where: + # - the `data` array is the original dense array, `Y` + # - the `indices` array is a single row of n_features elements: + # + # [0, 1, ..., n_features-1] + # + # - the `indptr` array is a single row of n_X + 1 elements: + # + # [0, n_features, 2 * n_features, ..., n_features * n_X] + # + # If we were to use the natural CSR representation for the dense array, + # the indices would have required allocating an array of + # n_samples × n_features elements with repeated contiguous integers from + # 0 to n_features - 1, and this would have been very wasteful from a memory + # point of view. This alternative representation just uses the necessary + # amount of information and only necessitates shifting + # the address of `indices` before calling the CSR × CSR routines. + # Y array already has been checked here self.n_Y = Y.shape[0] self.Y_data = np.ravel(Y) - # Since Y vectors are dense, we can use a single arrays - # of indices of self.n_features elements instead of - # a self.n_Y × self.n_features matrices. - # The implementations of DistanceMetric.{dist_csr,rdist_csr} - # support this representation. self.Y_indices = np.arange(self.n_features, dtype=SPARSE_INDEX_TYPE) @final @@ -291,29 +307,43 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): @final cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t x2_start = j * self.n_features + ITYPE_t x2_end = (j + 1) * self.n_features + return self.distance_metric.rdist_csr( x1_data=self.X_data, - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=self.Y_data, - x2_indices=self.Y_indices, + # To use the same indices array, we shift the array address to map + # accesses in [x2_start, x2_end) to + # accesses in [0, x2_end - x2_start) == [0, n_features). + x2_indices=&self.Y_indices[0] - x2_start, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], - x2_start=j * self.n_features, - x2_end=(j + 1) * self.n_features, + x2_start=x2_start, + x2_end=x2_end, size=self.n_features, ) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t x2_start = j * self.n_features + ITYPE_t x2_end = (j + 1) * self.n_features + return self.distance_metric.dist_csr( x1_data=self.X_data, - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=self.Y_data, - x2_indices=self.Y_indices, + # To use the same `indices` array, we shift the array address to map + # accesses in [x2_start, x2_end) to + # accesses in [0, x2_end - x2_start) == [0, n_features).
+ x2_indices=&self.Y_indices[0] - x2_start, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], - x2_start=j * self.n_features, - x2_end=(j + 1) * self.n_features, + x2_start=x2_start, + x2_end=x2_end, size=self.n_features, ) From a3cf4d81899021f65d0f62278518db5741a7ea6d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 20 Jul 2022 15:50:41 +0200 Subject: [PATCH 38/68] DOC Update comment and changelog Co-authored-by: Olivier Grisel --- doc/whats_new/v1.2.rst | 6 ------ .../_pairwise_distances_reduction/_datasets_pair.pyx | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index f9cc448595477..7052bdbed3426 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -56,10 +56,7 @@ Changes impacting all modules - :func:`sklearn.metrics.pairwise_distances_argmin_min` - :class:`sklearn.cluster.AffinityPropagation` - :class:`sklearn.cluster.Birch` - - :class:`sklearn.cluster.MeanShift` - - :class:`sklearn.cluster.OPTICS` - :class:`sklearn.cluster.SpectralClustering` - - :func:`sklearn.feature_selection.mutual_info_regression` - :class:`sklearn.neighbors.KNeighborsClassifier` - :class:`sklearn.neighbors.KNeighborsRegressor` - :class:`sklearn.neighbors.RadiusNeighborsClassifier` @@ -67,11 +64,8 @@ Changes impacting all modules - :class:`sklearn.neighbors.LocalOutlierFactor` - :class:`sklearn.neighbors.NearestNeighbors` - :class:`sklearn.manifold.Isomap` - - :class:`sklearn.manifold.LocallyLinearEmbedding` - :class:`sklearn.manifold.TSNE` - :func:`sklearn.manifold.trustworthiness` - - :class:`sklearn.semi_supervised.LabelPropagation` - - :class:`sklearn.semi_supervised.LabelSpreading` :pr:`23585` by `Julien Jerphanion ` diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx index c8cb7eebba3e7..8d455e51b14e9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx @@ -110,7 +110,8 @@ cdef class DatasetsPair: @classmethod def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + # TODO: leave X.data unchanged once float32 is supported. X_data = np.asarray(X.data, dtype=DTYPE) X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) @@ -294,7 +295,6 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): # Y array already has been checked here self.n_Y = Y.shape[0] self.Y_data = np.ravel(Y) - self.Y_indices = np.arange(self.n_features, dtype=SPARSE_INDEX_TYPE) @final From e9ecbbc8ca856b46b36c9069b1720000dbfda17c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 10 Aug 2022 17:55:10 +0200 Subject: [PATCH 39/68] DOC Improve comments and code self-documentation Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 32 ++++++++++--------- .../_datasets_pair.pyx | 12 ++++--- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 19343aaa4274e..4745ea3e0bf55 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -417,7 +417,9 @@ cdef class DistanceMetric{{name_suffix}}: 1. 
The `indices` are passed using pointers to be able to support an alternative representation of the CSR data structure for supporting - fused sparse-dense datasets pairs. See the + fused sparse-dense datasets pairs. + + See the explanations in `SparseDenseDatasetsPair.__init__`. 2. An alternative signature would be: @@ -429,8 +431,8 @@ cdef class DistanceMetric{{name_suffix}}: const SPARSE_INDEX_TYPE_t* x2_indices, ) nogil except -1: - Where calles would use slicing on the original CSR data and indices - memoryview: + Where callees would use slicing on the original CSR data and indices + memoryviews: x1_start = X1_csr.indices_ptr[i] x1_end = X1_csr.indices_ptr[i+1] @@ -641,7 +643,7 @@ cdef class DistanceMetric{{name_suffix}}: def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): cdef: - ITYPE_t n_X, size + ITYPE_t n_X, n_features const {{INPUT_DTYPE_t}}[:] X_data const SPARSE_INDEX_TYPE_t[:] X_indices const SPARSE_INDEX_TYPE_t[:] X_indptr @@ -654,7 +656,7 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] Darr X_csr = X.tocsr() - n_X, size = X_csr.shape + n_X, n_features = X_csr.shape X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) @@ -664,7 +666,7 @@ cdef class DistanceMetric{{name_suffix}}: x1_data=X_data, x1_indices=&X_indices[0], x1_indptr=X_indptr, - size=size, + size=n_features, D=Darr, ) else: @@ -682,7 +684,7 @@ cdef class DistanceMetric{{name_suffix}}: x2_data=Y_data, x2_indices=&Y_indices[0], x2_indptr=Y_indptr, - size=size, + size=n_features, D=Darr, ) return Darr @@ -693,7 +695,7 @@ cdef class DistanceMetric{{name_suffix}}: ): cdef: ITYPE_t n_X = X.shape[0] - ITYPE_t size = X.shape[1] + ITYPE_t n_features = X.shape[1] const {{INPUT_DTYPE_t}}[:] X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) const SPARSE_INDEX_TYPE_t[:] X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) const SPARSE_INDEX_TYPE_t[:] X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) @@ -702,9 +704,9 @@ cdef class DistanceMetric{{name_suffix}}: # `SparseDenseDatasetsPair.__init__`. ITYPE_t n_Y = Y.shape[0] const {{INPUT_DTYPE_t}}[:] Y_data = Y.reshape(-1) - const SPARSE_INDEX_TYPE_t[:] Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + const SPARSE_INDEX_TYPE_t[:] Y_indices = np.arange(n_features, dtype=SPARSE_INDEX_TYPE) const SPARSE_INDEX_TYPE_t[:] Y_indptr = np.arange( - start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE + start=0, stop=n_features * (n_X + 1), step=n_features, dtype=SPARSE_INDEX_TYPE ) DTYPE_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') @@ -716,7 +718,7 @@ cdef class DistanceMetric{{name_suffix}}: x2_data=Y_data, x2_indices=&Y_indices[0], x2_indptr=Y_indptr, - size=size, + size=n_features, D=Darr, ) return Darr @@ -730,14 +732,14 @@ cdef class DistanceMetric{{name_suffix}}: # have come with an extra copy to ensure C-contiguity of the result. 
cdef: ITYPE_t n_X = X.shape[0] - ITYPE_t size = X.shape[1] + ITYPE_t n_features = X.shape[1] # To understand the CSR format used here, see the comment in # `SparseDenseDatasetsPair.__init__` const {{INPUT_DTYPE_t}}[:] X_data = X.reshape(-1) - const SPARSE_INDEX_TYPE_t[:] X_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + const SPARSE_INDEX_TYPE_t[:] X_indices = np.arange(n_features, dtype=SPARSE_INDEX_TYPE) const SPARSE_INDEX_TYPE_t[:] X_indptr = np.arange( - start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE + start=0, stop=n_features * (n_X + 1), step=n_features, dtype=SPARSE_INDEX_TYPE ) ITYPE_t n_Y = Y.shape[0] const {{INPUT_DTYPE_t}}[:] Y_data = np.asarray(Y.data, dtype={{INPUT_DTYPE}}) @@ -753,7 +755,7 @@ cdef class DistanceMetric{{name_suffix}}: x2_data=Y_data, x2_indices=&Y_indices[0], x2_indptr=Y_indptr, - size=size, + size=n_features, D=Darr, ) return Darr diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx index 8d455e51b14e9..372769cafc942 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx @@ -309,7 +309,7 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t x2_start = j * self.n_features - ITYPE_t x2_end = (j + 1) * self.n_features + ITYPE_t x2_end = x2_start + self.n_features return self.distance_metric.rdist_csr( x1_data=self.X_data, @@ -317,7 +317,8 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): x2_data=self.Y_data, # To use the same indices array, we shift the array address to map # accesses in [x2_start, x2_end) to - # accesses in [0, x2_end - x2_start) == [0, n_features). + # accesses in [x2_start - x2_start, x2_end - x2_start), that is + # accesses in [0, n_features). x2_indices=&self.Y_indices[0] - x2_start, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], @@ -330,15 +331,16 @@ cdef class SparseDenseDatasetsPair(DatasetsPair): cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: cdef: ITYPE_t x2_start = j * self.n_features - ITYPE_t x2_end = (j + 1) * self.n_features + ITYPE_t x2_end = x2_start + self.n_features return self.distance_metric.dist_csr( x1_data=self.X_data, x1_indices=&self.X_indices[0], x2_data=self.Y_data, - # To use the same `indices` array, we shift the array address to map + # To use the same indices array, we shift the array address to map # accesses in [x2_start, x2_end) to - # accesses in [0, x2_end - x2_start) == [0, n_features). + # accesses in [x2_start - x2_start, x2_end - x2_start), that is + # accesses in [0, n_features). 
x2_indices=&self.Y_indices[0] - x2_start, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], From de5237119b74d381b94c9f1a9e0fc6e6310822e0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 11 Aug 2022 09:09:10 +0200 Subject: [PATCH 40/68] TST `dtype`-parametrize `test_format_agnosticism` --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 843c57287a57b..788972429f644 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -784,13 +784,14 @@ def test_n_threads_agnosticism( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], ) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_format_agnosticism( global_random_seed, PairwiseDistancesReduction, n_samples, chunk_size, n_features=100, - dtype=np.float64, + dtype, ): # Results must not depend on the number of threads rng = np.random.RandomState(global_random_seed) From 324315358abe05cef907789135ec183db5061bf8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 11 Aug 2022 10:13:04 +0200 Subject: [PATCH 41/68] fixup! TST `dtype`-parametrize `test_format_agnosticism` --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 788972429f644..9c6203e227446 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -790,8 +790,8 @@ def test_format_agnosticism( PairwiseDistancesReduction, n_samples, chunk_size, - n_features=100, dtype, + n_features=100, ): # Results must not depend on the number of threads rng = np.random.RandomState(global_random_seed) From e5219923a492ce20dcecbd0a48e1124b3788edb9 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 11 Aug 2022 11:32:52 -0400 Subject: [PATCH 42/68] MNT Pushing data up instead of indices --- sklearn/metrics/_dist_metrics.pxd.tp | 14 +- sklearn/metrics/_dist_metrics.pyx.tp | 132 +++++++++--------- .../_datasets_pair.pyx.tp | 47 +++---- 3 files changed, 91 insertions(+), 102 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index c7fe34868f329..d21a6387428de 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -101,9 +101,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -114,9 +114,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -140,7 +140,7 @@ cdef class DistanceMetric{{name_suffix}}: cdef int pdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, @@ -149,10 +149,10 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 4745ea3e0bf55..9b96538be3280 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -395,9 +395,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -425,9 +425,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, ) nogil except -1: @@ -461,9 +461,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -502,7 +502,7 @@ cdef class DistanceMetric{{name_suffix}}: cdef int pdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const 
ITYPE_t size, @@ -539,10 +539,10 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, @@ -663,7 +663,7 @@ cdef class DistanceMetric{{name_suffix}}: if X is Y: Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') self.pdist_csr( - x1_data=X_data, + x1_data=&X_data[0], x1_indices=&X_indices[0], x1_indptr=X_indptr, size=n_features, @@ -678,10 +678,10 @@ cdef class DistanceMetric{{name_suffix}}: Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( - x1_data=X_data, + x1_data=&X_data[0], x1_indices=&X_indices[0], x1_indptr=X_indptr, - x2_data=Y_data, + x2_data=&Y_data[0], x2_indices=&Y_indices[0], x2_indptr=Y_indptr, size=n_features, @@ -712,10 +712,10 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( - x1_data=X_data, + x1_data=&X_data[0], x1_indices=&X_indices[0], x1_indptr=X_indptr, - x2_data=Y_data, + x2_data=&Y_data[0], x2_indices=&Y_indices[0], x2_indptr=Y_indptr, size=n_features, @@ -749,10 +749,10 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( - x1_data=X_data, + x1_data=&X_data[0], x1_indices=&X_indices[0], x1_indptr=X_indptr, - x2_data=Y_data, + x2_data=&Y_data[0], x2_indices=&Y_indices[0], x2_indptr=Y_indptr, size=n_features, @@ -839,9 +839,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -891,9 +891,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -967,9 +967,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1020,9 +1020,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1069,9 +1069,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] 
x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1153,9 +1153,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1290,9 +1290,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1370,9 +1370,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1474,9 +1474,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1524,9 +1524,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1629,9 +1629,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1683,9 +1683,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1734,9 +1734,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const 
{{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1809,9 +1809,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1886,9 +1886,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1970,9 +1970,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2053,9 +2053,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2129,9 +2129,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2210,9 +2210,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2288,9 +2288,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2365,9 +2365,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const 
SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2435,9 +2435,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2513,9 +2513,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2617,9 +2617,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2641,9 +2641,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, - const {{INPUT_DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}* x1_data, const SPARSE_INDEX_TYPE_t* x1_indices, - const {{INPUT_DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}* x2_data, const SPARSE_INDEX_TYPE_t* x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 6966a030bf783..87012029e4dd6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -230,9 +230,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): @final cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.rdist_csr( - x1_data=self.X_data, + x1_data=&self.X_data[0], x1_indices=&self.X_indices[0], - x2_data=self.Y_data, + x2_data=&self.Y_data[0], x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], @@ -244,9 +244,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.dist_csr( - x1_data=self.X_data, + x1_data=&self.X_data[0], x1_indices=&self.X_indices[0], - x2_data=self.Y_data, + x2_data=&self.Y_data[0], x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], @@ -314,45 +314,34 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): @final cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - cdef: - ITYPE_t x2_start = j * self.n_features - ITYPE_t x2_end = x2_start + self.n_features - return self.distance_metric.rdist_csr( - x1_data=self.X_data, + x1_data=&self.X_data[0], x1_indices=&self.X_indices[0], - x2_data=self.Y_data, - # To use the same indices array, we shift the array address to map - # accesses in [x2_start, x2_end) to - # accesses in [x2_start - x2_start, x2_end - x2_start), 
that is - # accesses in [0, n_features). - x2_indices=&self.Y_indices[0] - x2_start, + # Push the data pointer forward such that x2_start=0 is alined with the + # j-th row + x2_data=&self.Y_data[0] + j * self.n_features, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], - x2_start=x2_start, - x2_end=x2_end, + x2_start=0, + x2_end=self.n_features, size=self.n_features, ) @final cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - cdef: - ITYPE_t x2_start = j * self.n_features - ITYPE_t x2_end = x2_start + self.n_features return self.distance_metric.dist_csr( - x1_data=self.X_data, + x1_data=&self.X_data[0], x1_indices=&self.X_indices[0], - x2_data=self.Y_data, - # To use the same indices array, we shift the array address to map - # accesses in [x2_start, x2_end) to - # accesses in [x2_start - x2_start, x2_end - x2_start), that is - # accesses in [0, n_features). - x2_indices=&self.Y_indices[0] - x2_start, + # Push the data pointer forward such that x2_start=0 is alined with the + # j-th row + x2_data=&self.Y_data[0] + j * self.n_features, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], - x2_start=x2_start, - x2_end=x2_end, + x2_start=0, + x2_end=self.n_features, size=self.n_features, ) From 972fff94d47269d8ec22b3c22f02e30583b9d681 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 11 Aug 2022 12:57:09 -0400 Subject: [PATCH 43/68] DOC Improve comment --- .../_pairwise_distances_reduction/_datasets_pair.pyx.tp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 87012029e4dd6..8234c94c79280 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -317,7 +317,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], x1_indices=&self.X_indices[0], - # Push the data pointer forward such that x2_start=0 is alined with the + # Increment the data pointer such that x2_start=0 is alined with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, x2_indices=&self.Y_indices[0], @@ -334,7 +334,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.dist_csr( x1_data=&self.X_data[0], x1_indices=&self.X_indices[0], - # Push the data pointer forward such that x2_start=0 is alined with the + # Increment the data pointer such that x2_start=0 is alined with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, x2_indices=&self.Y_indices[0], From 3086c0b492bff75f15c126eac7f3cea22ed9a8bf Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 11 Aug 2022 13:06:24 -0400 Subject: [PATCH 44/68] REV Revert back to memoryviews for indices --- sklearn/metrics/_dist_metrics.pxd.tp | 14 +- sklearn/metrics/_dist_metrics.pyx.tp | 132 +++++++++--------- .../_datasets_pair.pyx.tp | 16 +-- 3 files changed, 81 insertions(+), 81 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index d21a6387428de..e0e67758f5023 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -102,9 +102,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -115,9 +115,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -141,7 +141,7 @@ cdef class DistanceMetric{{name_suffix}}: cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, @@ -150,10 +150,10 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 9b96538be3280..d75c80896fe18 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -396,9 +396,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -426,9 +426,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, ) nogil except -1: Where callees would use slicing on the original CSR data and indices @@ -462,9 +462,9 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const 
SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -503,7 +503,7 @@ cdef class DistanceMetric{{name_suffix}}: cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, @@ -540,10 +540,10 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, @@ -664,7 +664,7 @@ cdef class DistanceMetric{{name_suffix}}: Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') self.pdist_csr( x1_data=&X_data[0], - x1_indices=&X_indices[0], + x1_indices=X_indices, x1_indptr=X_indptr, size=n_features, D=Darr, @@ -679,10 +679,10 @@ cdef class DistanceMetric{{name_suffix}}: Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.cdist_csr( x1_data=&X_data[0], - x1_indices=&X_indices[0], + x1_indices=X_indices, x1_indptr=X_indptr, x2_data=&Y_data[0], - x2_indices=&Y_indices[0], + x2_indices=Y_indices, x2_indptr=Y_indptr, size=n_features, D=Darr, @@ -713,10 +713,10 @@ cdef class DistanceMetric{{name_suffix}}: self.cdist_csr( x1_data=&X_data[0], - x1_indices=&X_indices[0], + x1_indices=X_indices, x1_indptr=X_indptr, x2_data=&Y_data[0], - x2_indices=&Y_indices[0], + x2_indices=Y_indices, x2_indptr=Y_indptr, size=n_features, D=Darr, @@ -750,10 +750,10 @@ cdef class DistanceMetric{{name_suffix}}: self.cdist_csr( x1_data=&X_data[0], - x1_indices=&X_indices[0], + x1_indices=X_indices, x1_indptr=X_indptr, x2_data=&Y_data[0], - x2_indices=&Y_indices[0], + x2_indices=Y_indices, x2_indptr=Y_indptr, size=n_features, D=Darr, @@ -840,9 +840,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -892,9 +892,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -968,9 +968,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1021,9 +1021,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, 
- const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1070,9 +1070,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1154,9 +1154,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1291,9 +1291,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1371,9 +1371,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1475,9 +1475,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1525,9 +1525,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1630,9 +1630,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const 
SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1684,9 +1684,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1735,9 +1735,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1810,9 +1810,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1887,9 +1887,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -1971,9 +1971,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2054,9 +2054,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2130,9 +2130,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2211,9 +2211,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( 
self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2289,9 +2289,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2366,9 +2366,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2436,9 +2436,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2514,9 +2514,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2618,9 +2618,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, @@ -2642,9 +2642,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const SPARSE_INDEX_TYPE_t* x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const SPARSE_INDEX_TYPE_t* x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, const SPARSE_INDEX_TYPE_t x2_start, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 8234c94c79280..622d083c85443 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ 
b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -231,9 +231,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=&self.X_indices[0], + x1_indices=self.X_indices, x2_data=&self.Y_data[0], - x2_indices=&self.Y_indices[0], + x2_indices=self.Y_indices, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -245,9 +245,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=&self.X_indices[0], + x1_indices=self.X_indices, x2_data=&self.Y_data[0], - x2_indices=&self.Y_indices[0], + x2_indices=self.Y_indices, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -316,11 +316,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=&self.X_indices[0], + x1_indices=self.X_indices, # Increment the data pointer such that x2_start=0 is alined with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=&self.Y_indices[0], + x2_indices=self.Y_indices, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, @@ -333,11 +333,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=&self.X_indices[0], + x1_indices=self.X_indices, # Increment the data pointer such that x2_start=0 is alined with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=&self.Y_indices[0], + x2_indices=self.Y_indices, x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, From afa0c35d59b796cbf20bc3a23cf2be428f9201a0 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 11 Aug 2022 13:13:14 -0400 Subject: [PATCH 45/68] DOC Spelling --- .../_pairwise_distances_reduction/_datasets_pair.pyx.tp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 622d083c85443..8aa922a1e6665 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -317,7 +317,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], x1_indices=self.X_indices, - # Increment the data pointer such that x2_start=0 is alined with the + # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, x2_indices=self.Y_indices, @@ -334,7 +334,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.dist_csr( x1_data=&self.X_data[0], x1_indices=self.X_indices, - # Increment the data pointer such that x2_start=0 is alined with the + # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, x2_indices=self.Y_indices, From be59297c0da9f2d7c09f98f6bea406cecbda90f7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 22 Aug 2022 10:52:28 +0200 Subject: [PATCH 46/68] TST Suggest logic adaptation for _pairwise_{dense_sparse,sparse_dense} This fixes those two methods used for testing which were using the previous trick. This allocate potentially large `indices` arrays. This also add missing kwargs for some others tests. --- sklearn/metrics/_dist_metrics.pyx.tp | 34 ++++++++++++------- sklearn/metrics/tests/test_dist_metrics.py | 16 ++++++--- .../test_pairwise_distances_reduction.py | 6 ++++ 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index d75c80896fe18..9802ae504a28a 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -639,7 +639,7 @@ cdef class DistanceMetric{{name_suffix}}: self._validate_data(Yarr) Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') self.cdist(Xarr, Yarr, Darr) - return Darr + return np.asarray(Darr) def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): cdef: @@ -687,7 +687,7 @@ cdef class DistanceMetric{{name_suffix}}: size=n_features, D=Darr, ) - return Darr + return np.asarray(Darr) def _pairwise_sparse_dense(self, X: csr_matrix, @@ -700,11 +700,14 @@ cdef class DistanceMetric{{name_suffix}}: const SPARSE_INDEX_TYPE_t[:] X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) const SPARSE_INDEX_TYPE_t[:] X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - # To understand the CSR format used here, see the comment in - # `SparseDenseDatasetsPair.__init__`. ITYPE_t n_Y = Y.shape[0] - const {{INPUT_DTYPE_t}}[:] Y_data = Y.reshape(-1) - const SPARSE_INDEX_TYPE_t[:] Y_indices = np.arange(n_features, dtype=SPARSE_INDEX_TYPE) + const {{INPUT_DTYPE_t}}[:] Y_data = np.reshape(Y, -1) + + # Potentially costly to store, but allows using `cdist_csr` + # without logic duplication. 
+ const SPARSE_INDEX_TYPE_t[:] Y_indices = np.tile( + np.arange(n_features, dtype=SPARSE_INDEX_TYPE), reps=n_Y, + ) const SPARSE_INDEX_TYPE_t[:] Y_indptr = np.arange( start=0, stop=n_features * (n_X + 1), step=n_features, dtype=SPARSE_INDEX_TYPE ) @@ -721,7 +724,7 @@ cdef class DistanceMetric{{name_suffix}}: size=n_features, D=Darr, ) - return Darr + return np.asarray(Darr) def _pairwise_dense_sparse(self, const {{INPUT_DTYPE_t}}[:, ::1] X, @@ -734,10 +737,13 @@ cdef class DistanceMetric{{name_suffix}}: ITYPE_t n_X = X.shape[0] ITYPE_t n_features = X.shape[1] - # To understand the CSR format used here, see the comment in - # `SparseDenseDatasetsPair.__init__` - const {{INPUT_DTYPE_t}}[:] X_data = X.reshape(-1) - const SPARSE_INDEX_TYPE_t[:] X_indices = np.arange(n_features, dtype=SPARSE_INDEX_TYPE) + const {{INPUT_DTYPE_t}}[:] X_data = np.reshape(X, -1) + + # Potentially costly to store, but allows using `cdist_csr` + # without logic duplication. + const SPARSE_INDEX_TYPE_t[:] X_indices = np.tile( + np.arange(n_features, dtype=SPARSE_INDEX_TYPE), reps=n_X, + ) const SPARSE_INDEX_TYPE_t[:] X_indptr = np.arange( start=0, stop=n_features * (n_X + 1), step=n_features, dtype=SPARSE_INDEX_TYPE ) @@ -758,7 +764,7 @@ cdef class DistanceMetric{{name_suffix}}: size=n_features, D=Darr, ) - return Darr + return np.asarray(Darr) def pairwise(self, X, Y=None): @@ -792,11 +798,15 @@ cdef class DistanceMetric{{name_suffix}}: Y_is_sparse = issparse(Y) if not X_is_sparse and not Y_is_sparse: + X = np.asarray(X, order='C') + Y = np.asarray(Y, order='C') return self._pairwise_dense_dense(X, Y) if X_is_sparse and Y_is_sparse: return self._pairwise_sparse_sparse(X, Y) if X_is_sparse and not Y_is_sparse: + Y = np.asarray(Y, order='C') return self._pairwise_sparse_dense(X, Y) + X = np.asarray(X, order='C') return self._pairwise_dense_sparse(X, Y) #------------------------------------------------------------ diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index e11be4dab3e20..bb95681ebc90e 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -73,7 +73,9 @@ def dist_func(x1, x2, p): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") -@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) @pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) def test_cdist(metric_param_grid, X, Y): DistanceMetricInterface = ( @@ -158,7 +160,9 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") -@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) @pytest.mark.parametrize("X", [X64, X32, X_mmap]) def test_pdist(metric_param_grid, X): DistanceMetricInterface = ( @@ -207,7 +211,9 @@ def test_pdist(metric_param_grid, X): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") -@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) 
def test_distance_metrics_dtype_consistency(metric_param_grid): # DistanceMetric must return similar distances for both float32 and float64 # input data. @@ -258,7 +264,9 @@ def test_pdist_bool_metrics(metric, X_bool): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("writable_kwargs", [True, False]) -@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) @pytest.mark.parametrize("X", [X64, X32]) def test_pickle(writable_kwargs, metric_param_grid, X): DistanceMetricInterface = ( diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 9c6203e227446..dfd96e6cc63fb 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -805,11 +805,13 @@ def test_format_agnosticism( if PairwiseDistancesReduction is PairwiseDistancesArgKmin: parameter = 10 check_parameters = {} + compute_parameters = {} else: # Scaling the radius slightly with the numbers of dimensions radius = 10 ** np.log(n_features) parameter = radius check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} # XXX: use itertools.pairwise when available? dist_dense_dense, indices_dense_dense = PairwiseDistancesReduction.compute( @@ -817,6 +819,7 @@ def test_format_agnosticism( Y, parameter, return_distance=True, + **compute_parameters, ) dist_sparse_sparse, indices_sparse_sparse = PairwiseDistancesReduction.compute( @@ -824,6 +827,7 @@ def test_format_agnosticism( Y_csr, parameter, return_distance=True, + **compute_parameters, ) ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( @@ -839,6 +843,7 @@ def test_format_agnosticism( Y_csr, parameter, return_distance=True, + **compute_parameters, ) ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( @@ -854,6 +859,7 @@ def test_format_agnosticism( Y, parameter, return_distance=True, + **compute_parameters, ) ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( From ec1d4f97bed66d08112a91e7b6ea0b18ad233496 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 25 Aug 2022 17:29:44 +0200 Subject: [PATCH 47/68] DOC Add co-authors in `whats_new` entry Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.2.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 2a8a3d49dc5bb..55867300db3ea 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -69,7 +69,9 @@ Changes impacting all modules - :class:`sklearn.manifold.TSNE` - :func:`sklearn.manifold.trustworthiness` - :pr:`23585` by `Julien Jerphanion ` + :pr:`23604` and :pr:`23585` by `Julien Jerphanion `, + `Olivier Grisel `, and `Thomas Fan`_. 
+ Changelog --------- From fbf311e4e348a603153ac746569e1887e5b741e5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 26 Aug 2022 11:11:14 +0200 Subject: [PATCH 48/68] Do not support CSR matrices without non-zero elements --- .../metrics/_pairwise_distances_reduction/_dispatcher.py | 8 ++++++-- .../metrics/tests/test_pairwise_distances_reduction.py | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index e57244f4ecf3b..9830777bafa2c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -98,10 +98,14 @@ def is_usable_for(cls, X, Y, metric) -> bool: def is_numpy_c_ordered(X): return hasattr(X, "flags") and X.flags.c_contiguous + def is_valid_sparse_matrix(X): + # TODO: support CSR matrices without non-zeros elements + return isspmatrix_csr(X) and X.nnz > 0 + return ( get_config().get("enable_cython_pairwise_dist", True) - and (is_numpy_c_ordered(X) or isspmatrix_csr(X)) - and (is_numpy_c_ordered(Y) or isspmatrix_csr(Y)) + and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X)) + and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) and X.dtype == Y.dtype and X.dtype in (np.float32, np.float64) and metric in cls.valid_metrics() diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index cbf7cfeeb0afc..13b9f87e1f982 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -545,6 +545,11 @@ def test_pairwise_distances_reduction_is_usable_for(): # F-ordered arrays are not supported assert not PairwiseDistancesReduction.is_usable_for(np.asfortranarray(X), Y, metric) + # CSR matrices without non-zeros elements aren't currently supported + # TODO: support CSR matrices without non-zeros elements + X_csr_0_nnz = csr_matrix(X * 0) + assert not PairwiseDistancesReduction.is_usable_for(X_csr_0_nnz, Y, metric) + def test_argkmin_factory_method_wrong_usages(): rng = np.random.RandomState(1) From 8fddffdba9fe2473ae9307cb451d440496c17278 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sun, 28 Aug 2022 17:00:43 +0200 Subject: [PATCH 49/68] fixup! 
Merge branch 'main' into maint/pdr-sparse-support --- .../test_pairwise_distances_reduction.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 78c4a0262521a..55d83f4adc26b 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -782,13 +782,13 @@ def test_n_threads_agnosticism( @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( - "PairwiseDistancesReduction", + "Dispatcher", [ArgKmin, RadiusNeighbors], ) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_format_agnosticism( global_random_seed, - PairwiseDistancesReduction, + Dispatcher, n_samples, chunk_size, dtype, @@ -803,7 +803,7 @@ def test_format_agnosticism( X_csr = csr_matrix(X) Y_csr = csr_matrix(Y) - if PairwiseDistancesReduction is ArgKmin: + if Dispatcher is ArgKmin: parameter = 10 check_parameters = {} compute_parameters = {} @@ -815,7 +815,7 @@ def test_format_agnosticism( compute_parameters = {"sort_results": True} # XXX: use itertools.pairwise when available? - dist_dense_dense, indices_dense_dense = PairwiseDistancesReduction.compute( + dist_dense_dense, indices_dense_dense = Dispatcher.compute( X, Y, parameter, @@ -823,7 +823,7 @@ def test_format_agnosticism( **compute_parameters, ) - dist_sparse_sparse, indices_sparse_sparse = PairwiseDistancesReduction.compute( + dist_sparse_sparse, indices_sparse_sparse = Dispatcher.compute( X_csr, Y_csr, parameter, @@ -831,7 +831,7 @@ def test_format_agnosticism( **compute_parameters, ) - ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( + ASSERT_RESULT[(Dispatcher, dtype)]( dist_dense_dense, dist_sparse_sparse, indices_dense_dense, @@ -839,7 +839,7 @@ def test_format_agnosticism( **check_parameters, ) - dist_dense_sparse, indices_dense_sparse = PairwiseDistancesReduction.compute( + dist_dense_sparse, indices_dense_sparse = Dispatcher.compute( X, Y_csr, parameter, @@ -847,7 +847,7 @@ def test_format_agnosticism( **compute_parameters, ) - ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( + ASSERT_RESULT[(Dispatcher, dtype)]( dist_dense_dense, dist_dense_sparse, indices_dense_dense, @@ -855,7 +855,7 @@ def test_format_agnosticism( **check_parameters, ) - dist_sparse_dense, indices_sparse_dense = PairwiseDistancesReduction.compute( + dist_sparse_dense, indices_sparse_dense = Dispatcher.compute( X_csr, Y, parameter, @@ -863,7 +863,7 @@ def test_format_agnosticism( **compute_parameters, ) - ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( + ASSERT_RESULT[(Dispatcher, dtype)]( dist_dense_dense, dist_sparse_dense, indices_dense_dense, From 4b879f1865c0ebf3362159665ef879e06b5466f9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 29 Aug 2022 14:39:30 +0200 Subject: [PATCH 50/68] MAINT Do not pop Y_norm_squared when unused --- .../_pairwise_distances_reduction/_argkmin.pyx.tp | 10 ++-------- .../_radius_neighborhood.pyx.tp | 13 +++++-------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index bd9f8d8a96b6f..1c1459e27f210 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -98,14 +98,8 @@ cdef class 
ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): metric_kwargs=metric_kwargs, ) else: - # Fall back on a generic implementation that handles all distance - # metrics by computing it between 2 vectors at a time. - - # The extra `Y_norm_squared` argument for the back-end is only - # supported for the Euclidean variant. - if metric_kwargs is not None: - metric_kwargs.pop("Y_norm_squared", None) - + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. pda = ArgKmin{{name_suffix}}( datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), k=k, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 9be93151d722c..616e8b76f88df 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -122,18 +122,13 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): sort_results=sort_results, ) else: - # Fall back on a generic implementation that handles all distance - # metrics by computing it between 2 vectors at a time. - - # The extra `Y_norm_squared` argument for the back-end is only - # supported for the Euclidean variant. - if metric_kwargs is not None: - metric_kwargs.pop("Y_norm_squared", None) - + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. pda = RadiusNeighbors{{name_suffix}}( datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size, + metric_kwargs=metric_kwargs, strategy=strategy, sort_results=sort_results, ) @@ -156,6 +151,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): chunk_size=None, strategy=None, sort_results=False, + metric_kwargs=None, ): super().__init__( datasets_pair=datasets_pair, @@ -378,6 +374,7 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} chunk_size=chunk_size, strategy=strategy, sort_results=sort_results, + metric_kwargs=metric_kwargs, ) # X and Y are checked by the DatasetsPair{{name_suffix}} implemented # as a DenseDenseDatasetsPair{{name_suffix}} From ca492361bb3851bfc4ab363327a97713179126b6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Sep 2022 09:23:02 +0200 Subject: [PATCH 51/68] Explicitly do not support CSR matrices with int64 indices and indptr See: https://github.com/scikit-learn/scikit-learn/issues/23653 --- .../_pairwise_distances_reduction/_dispatcher.py | 12 ++++++++++-- .../tests/test_pairwise_distances_reduction.py | 8 ++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 0a152b1461652..7b4cc94306494 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -99,8 +99,16 @@ def is_numpy_c_ordered(X): return hasattr(X, "flags") and X.flags.c_contiguous def is_valid_sparse_matrix(X): - # TODO: support CSR matrices without non-zeros elements - return isspmatrix_csr(X) and X.nnz > 0 + return ( + isspmatrix_csr(X) + and + # TODO: support CSR matrices without non-zeros elements + X.nnz > 0 + and + # TODO: support CSR matrices with int64 indices and 
indptr + # See: https://github.com/scikit-learn/scikit-learn/issues/23653 + X.indices.dtype == X.indptr.dtype == np.int32 + ) return ( get_config().get("enable_cython_pairwise_dist", True) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 55d83f4adc26b..7302e3369096d 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -556,6 +556,14 @@ def test_pairwise_distances_reduction_is_usable_for(): X_csr_0_nnz = csr_matrix(X * 0) assert not BaseDistanceReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric) + # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features) + # aren't supported as of now. + # See: https://github.com/scikit-learn/scikit-learn/issues/23653 + # TODO: support CSR matrices with int64 indices and indptr + X_csr_int64 = csr_matrix(X) + X_csr_int64.indices = X_csr_int64.indices.astype(np.int64) + assert not BaseDistanceReductionDispatcher.is_usable_for(X_csr_int64, Y, metric) + def test_argkmin_factory_method_wrong_usages(): rng = np.random.RandomState(1) From d7b3649ba135f7c6d06b28c69caa0feee1118bd4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Sep 2022 09:50:46 +0200 Subject: [PATCH 52/68] DOC Update and improve comment for the alternative CSR representation --- .../_datasets_pair.pyx.tp | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 183f52c569013..fa156e059fb89 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -91,11 +91,11 @@ cdef class DatasetsPair{{name_suffix}}: datasets_pair: DatasetsPair{{name_suffix}} The suited DatasetsPair{{name_suffix}} implementation. """ - # Y_norm_squared might be propagated down to DatasetsPairs - # via metrics_kwargs when the Euclidean specialisations - # can't be used. To prevent Y_norm_squared to be passed - # down to DistanceMetrics (whose constructors would raise - # a RuntimeError), we pop it here. + # Y_norm_squared might be propagated down to DatasetsPairs + # via metrics_kwargs when the Euclidean specialisations + # can't be used. To prevent Y_norm_squared to be passed + # down to DistanceMetrics (whose constructors would raise + # a RuntimeError), we pop it here. if metric_kwargs is not None: metric_kwargs.pop("Y_norm_squared", None) cdef: @@ -284,26 +284,32 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - # We support the sparse-dense case by using the sparse-sparse case to avoid - # introducing a new complex set of interfaces. + # We support the sparse-dense case by using the sparse-sparse interfaces + # of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to + # avoid introducing a new complex set of interfaces. In this case, we + # need to convert `Y`, the dense array into a CSR matrix. Here we motive + # using another simpler CSR representation for `Y`. 
# - # To do so, we use an alternative CSR datastructure is used where: - # - the `data` array is the original dense array, `Y` - # - the `indices` array is a single row of n_features elements: + # Indeed, if we were to use the usual CSR representation for `Y`, + # storing all the columns indices in `indices` would have required + # allocating an array of n_samples × n_features elements with repeated + # contiguous integers from 0 to n_features - 1. This would have been + # very wasteful from a memory point of view. This alternative + # representation just uses the necessary amount of information needed + # and only necessitates shifting the address of `data` before calling + # the CSR × CSR routines. In this representation: # - # [0, 1, ..., n_features-1] + # - the `data` array is the original dense array, `Y`, whose first + # element's address is shifted before calling the CSR × CSR routine # - # - the `indptr` array is a single row of n_X + 1 elements: + # - the `indices` array is a single row of `n_features` elements: # - # [0, n_features, 2 * n_features, ..., n_features * n_X] + # [0, 1, ..., n_features-1] # - # If we were to use the natural CSR representation for the dense array, - # the indices would have required allocating an array of - # n_samples × n_features elements with repeated contiguous integers from - # 0 to n_features - 1, and this would have been very wasteful from a memory - # point of view. This alternative representation just uses the necessary - # amount of information needed and only necessitates shifting - # the address of `indices` before calling the CSR × CSR routines. + # - the `indptr` array is not materialised as the indices pointers' + # offset is constant (the offset equals `n_features`). Moreover, as + # `data` is shifted, constant `start` and `end` indices pointers + # respectively equalling 0 and n_features are used. # Y array already has been checked here self.n_Y = Y.shape[0] From 5e13663aabf245b98786a214a2f63804547a822c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Sep 2022 10:02:08 +0200 Subject: [PATCH 53/68] fixup! DOC Update and improve comment for the alternative CSR representation --- .../_datasets_pair.pyx.tp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index fa156e059fb89..2705952c73d87 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -287,17 +287,19 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): # We support the sparse-dense case by using the sparse-sparse interfaces # of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to # avoid introducing a new complex set of interfaces. In this case, we - # need to convert `Y`, the dense array into a CSR matrix. Here we motive - # using another simpler CSR representation for `Y`. + # need to convert `Y` (the dense array) into a CSR matrix. # - # Indeed, if we were to use the usual CSR representation for `Y`, - # storing all the columns indices in `indices` would have required - # allocating an array of n_samples × n_features elements with repeated - # contiguous integers from 0 to n_features - 1. This would have been - # very wasteful from a memory point of view. 
This alternative - # representation just uses the necessary amount of information needed - # and only necessitates shifting the address of `data` before calling - # the CSR × CSR routines. In this representation: + # Here we motive using another simpler CSR representation to use for `Y`. + # + # If we were to use the usual CSR representation for `Y`, storing all + # the columns indices in `indices` would have required allocating an + # array of n_samples × n_features elements with repeated contiguous + # integers from 0 to n_features - 1. This would have been very wasteful + # from a memory point of view. This alternative representation just uses + # the necessary amount of information needed and only necessitates + # shifting the address of `data` before calling the CSR × CSR routines. + # + # In this representation: # # - the `data` array is the original dense array, `Y`, whose first # element's address is shifted before calling the CSR × CSR routine From 1de8acbcbcbef27b43b9bddf229b7f0b39f75113 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 9 Sep 2022 13:50:43 +0200 Subject: [PATCH 54/68] CI Retrigger CI due to faulty runs From 8f43a5a259bfdb403ee3336cd8f89e6145abe96e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 10:21:27 +0200 Subject: [PATCH 55/68] DOC Update whats_new entry --- doc/whats_new/v1.2.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 343e3f163a83e..c6a2559a45945 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -52,7 +52,8 @@ Changes impacting all modules :pr:`23197` by :user:`Meekail Zain ` - |Enhancement| Support for combinations of dense and sparse datasets pairs - for all distance metrics has been added on the following estimators: + for all distance metrics and for float32 and float64 datasets has been added + or has seen its performance improved for the following estimators: - :func:`sklearn.metrics.pairwise_distances_argmin` - :func:`sklearn.metrics.pairwise_distances_argmin_min` @@ -69,8 +70,8 @@ Changes impacting all modules - :class:`sklearn.manifold.TSNE` - :func:`sklearn.manifold.trustworthiness` - :pr:`23604` and :pr:`23585` by `Julien Jerphanion `, - `Olivier Grisel `, and `Thomas Fan`_. + :pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion `, + :user:`Olivier Grisel `, and `Thomas Fan`_. Changelog @@ -327,7 +328,7 @@ Changelog - |Fix| Allows `csr_matrix` as input for parameter: `y_true` of the :func:`metrics.label_ranking_average_precision_score` metric. :pr:`23442` by :user:`Sean Atukorala ` - + - |Fix| :func:`metrics.ndcg_score` will now trigger a warning when the `y_true` value contains a negative value. Users may still use negative values, but the result may not be between 0 and 1. 
Starting in v1.4, passing in negative From a229b351f8b63c0c818d69c6c6f7b1ba5dc20116 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 10:56:45 +0200 Subject: [PATCH 56/68] Test and document Isomap on sparse data --- sklearn/manifold/_isomap.py | 8 ++++---- sklearn/manifold/tests/test_isomap.py | 22 +++++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 0d3c2ecc7b0d3..f7e8d8b669812 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -330,9 +330,9 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors} + X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors} Sample data, shape = (n_samples, n_features), in the form of a - numpy array, sparse graph, precomputed tree, or NearestNeighbors + numpy array, sparse matrix, precomputed tree, or NearestNeighbors object. y : Ignored @@ -352,7 +352,7 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : {array-like, sparse graph, BallTree, KDTree} + X : {array-like, sparse matrix, BallTree, KDTree} Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. @@ -381,7 +381,7 @@ def transform(self, X): Parameters ---------- - X : array-like, shape (n_queries, n_features) + X : {array-like, sparse matrix}, shape (n_queries, n_features) If neighbors_algorithm='precomputed', X is assumed to be a distance matrix or a sparse graph of shape (n_queries, n_samples_fit). diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index 73365b08a5cfb..ccd4c2899d20a 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -216,19 +216,19 @@ def test_isomap_clone_bug(): assert model.nbrs_.n_neighbors == n_neighbors -def test_sparse_input(): +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +def test_sparse_input(eigen_solver, path_method): X = sparse_rand(100, 3, density=0.1, format="csr") - # Should not error - for eigen_solver in eigen_solvers: - for path_method in path_methods: - clf = manifold.Isomap( - n_components=2, - eigen_solver=eigen_solver, - path_method=path_method, - n_neighbors=8, - ) - clf.fit(X) + clf = manifold.Isomap( + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + n_neighbors=8, + ) + clf.fit(X) + clf.transform(X) def test_isomap_fit_precomputed_radius_graph(): From 3e357a2b55645b0640e5d1d7148812ce4179c503 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 10:59:50 +0200 Subject: [PATCH 57/68] Test and document TSNE on sparse data --- sklearn/manifold/_t_sne.py | 11 +++++++---- sklearn/manifold/tests/test_t_sne.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 6da992900afe2..8813ee7ae5de0 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -461,11 +461,12 @@ def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"): Parameters ---------- - X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. 
- X_embedded : ndarray of shape (n_samples, n_components) + X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. n_neighbors : int, default=5 @@ -1095,7 +1096,8 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. If the method is 'exact', X may be a sparse matrix of type 'csr', 'csc' @@ -1121,7 +1123,8 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. If the method is 'exact', X may be a sparse matrix of type 'csr', 'csc' diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 51239d7ffd2cc..2df9e33b01a39 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -329,7 +329,7 @@ def test_optimization_minimizes_kl_divergence(): @pytest.mark.parametrize("method", ["exact", "barnes_hut"]) -def test_fit_csr_matrix(method): +def test_fit_transform_csr_matrix(method): # X can be a sparse matrix. rng = check_random_state(0) X = rng.randn(50, 2) From 97231159ee7a21627e6cb7dc406d611850c465ce Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 11:06:57 +0200 Subject: [PATCH 58/68] Test and document pairwise_distances_argmin on sparse data --- sklearn/metrics/pairwise.py | 4 ++-- sklearn/metrics/tests/test_pairwise.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 062dc61d63a96..e1b6afd36bd3b 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -721,10 +721,10 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs Parameters ---------- - X : array-like of shape (n_samples_X, n_features) + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) Array containing points. - Y : array-like of shape (n_samples_Y, n_features) + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) Arrays containing points. 
axis : int, default=1 diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 31964e2d182dd..d2ade2650832a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -446,7 +446,9 @@ def test_pairwise_distances_argmin_min(dtype): assert_array_almost_equal(vals, expected_vals) # sparse matrix case idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") + idxsp2 = pairwise_distances_argmin(Xsp, Ysp, metric="euclidean") assert_array_almost_equal(idxsp, expected_idx) + assert_array_almost_equal(idxsp2, expected_idx) assert_array_almost_equal(valssp, expected_vals) # We don't want np.matrix here assert type(idxsp) == np.ndarray @@ -478,7 +480,9 @@ def test_pairwise_distances_argmin_min(dtype): assert_array_almost_equal(vals, expected_vals) # sparse matrix case idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan") + idxsp2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan") assert_array_almost_equal(idxsp, expected_idx) + assert_array_almost_equal(idxsp2, expected_idx) assert_array_almost_equal(valssp, expected_vals) # Non-euclidean Scipy distance (callable) From faf704aae97927c5fae50077d450ec3a37b17424 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 11:12:14 +0200 Subject: [PATCH 59/68] Test and document LocalOutlierFactor on sparse data --- sklearn/neighbors/_lof.py | 10 +++++----- sklearn/neighbors/tests/test_lof.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 8dcfe704d3e06..ecfb1712f8503 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -238,7 +238,7 @@ def fit_predict(self, X, y=None): Parameters ---------- - X : array-like of shape (n_samples, n_features), default=None + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. @@ -335,7 +335,7 @@ def predict(self, X=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. @@ -353,7 +353,7 @@ def _predict(self, X=None): Parameters ---------- - X : array-like of shape (n_samples, n_features), default=None + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. If None, makes prediction on the training data without considering them as their own neighbors. @@ -403,7 +403,7 @@ def decision_function(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. @@ -446,7 +446,7 @@ def score_samples(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. 
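The `test_sparse` test added below only exercises the sparse code paths; the dense/sparse consistency check suggested by the TODOs later in this series could look roughly like the following sketch (illustrative data, not part of the patch):

    import numpy as np
    from numpy.testing import assert_allclose
    from scipy.sparse import csr_matrix
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(0)
    X_dense = rng.rand(100, 5)
    X_sparse = csr_matrix(X_dense)

    lof_dense = LocalOutlierFactor(novelty=True).fit(X_dense)
    lof_sparse = LocalOutlierFactor(novelty=True).fit(X_sparse)

    # Both fits should describe the same neighborhood structure, up to
    # floating-point differences between the dense and sparse back-ends.
    assert_allclose(
        lof_dense.negative_outlier_factor_,
        lof_sparse.negative_outlier_factor_,
        rtol=1e-5,
    )
    assert_allclose(
        lof_dense.score_samples(X_dense),
        lof_sparse.score_samples(X_sparse),
        rtol=1e-5,
    )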
diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py index fb8b0e1e3627f..5b897ee6b50d2 100644 --- a/sklearn/neighbors/tests/test_lof.py +++ b/sklearn/neighbors/tests/test_lof.py @@ -5,6 +5,8 @@ from math import sqrt import numpy as np +from scipy.sparse import csr_matrix + from sklearn import neighbors import re import pytest @@ -226,3 +228,17 @@ def test_predicted_outlier_number(expected_outliers): if num_outliers != expected_outliers: y_dec = clf.negative_outlier_factor_ check_outlier_corruption(num_outliers, expected_outliers, y_dec) + + +def test_sparse(): + # LocalOutlierFactor must support CSR inputs + X = csr_matrix(iris.data) + + lof = neighbors.LocalOutlierFactor(novelty=True) + lof.fit(X) + lof.predict(X) + lof.score_samples(X) + lof.decision_function(X) + + lof = neighbors.LocalOutlierFactor(novelty=False) + lof.fit_predict(X) From c66bb82c665f1516b366327bf64104b3a19ff118 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 11:20:21 +0200 Subject: [PATCH 60/68] DOC Add support for sparse data for NearestNeighbors, KNeighbors*, RadiusNeighbors* methods --- sklearn/neighbors/_base.py | 14 ++++++-------- sklearn/neighbors/_classification.py | 8 ++++---- sklearn/neighbors/_regression.py | 4 ++-- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 8cce5b6226cbb..3a0a702be3792 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -713,9 +713,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): Parameters ---------- - X : array-like, shape (n_queries, n_features), \ - or (n_queries, n_indexed) if metric == 'precomputed', \ - default=None + X : {array-like, sparse matrix}, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -901,9 +900,8 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ - or (n_queries, n_indexed) if metric == 'precomputed', \ - default=None + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -1046,7 +1044,7 @@ def radius_neighbors( Parameters ---------- - X : array-like of (n_samples, n_features), default=None + X : {array-like, sparse matrix} of (n_samples, n_features), default=None The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. @@ -1251,7 +1249,7 @@ def radius_neighbors_graph( Parameters ---------- - X : array-like of shape (n_samples, n_features), default=None + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None The query point or points. If not provided, neighbors of each indexed point are returned. In this case, the query point is not considered its own neighbor. 
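The docstring updates above make the CSR support of the neighbors queries explicit; in practice all of them can be driven with sparse input, for example (a sketch, assuming the patched version):

    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(30, 10))

    nn = NearestNeighbors(n_neighbors=3).fit(X)

    # k-nearest, radius-based and graph queries all accept CSR input.
    dist, ind = nn.kneighbors(X, return_distance=True)
    graph = nn.kneighbors_graph(X, mode="distance")
    neigh_dist, neigh_ind = nn.radius_neighbors(X, radius=1.0)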
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 884ca7804381d..3f8730675ef1b 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -219,7 +219,7 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. @@ -266,7 +266,7 @@ def predict_proba(self, X): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. @@ -601,7 +601,7 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. @@ -643,7 +643,7 @@ def predict_proba(self, X): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 86ffb0b77ff78..27b4a1c42a1b3 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -221,7 +221,7 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. @@ -448,7 +448,7 @@ def predict(self, X): Parameters ---------- - X : array-like of shape (n_queries, n_features), \ + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ or (n_queries, n_indexed) if metric == 'precomputed' Test samples. From 1eb5b2c3ce31f96eff0922adbdd236c5f0a5c879 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Sep 2022 11:27:28 +0200 Subject: [PATCH 61/68] DOC Remove formatting change --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index c6a2559a45945..e904a630296a3 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -328,7 +328,7 @@ Changelog - |Fix| Allows `csr_matrix` as input for parameter: `y_true` of the :func:`metrics.label_ranking_average_precision_score` metric. :pr:`23442` by :user:`Sean Atukorala ` - + - |Fix| :func:`metrics.ndcg_score` will now trigger a warning when the `y_true` value contains a negative value. Users may still use negative values, but the result may not be between 0 and 1. 
Starting in v1.4, passing in negative From fcf15b65fc51461d43c3bb14d78f2236f06d44fb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Sep 2022 13:55:53 +0200 Subject: [PATCH 62/68] TST Do not test on full cartesian product Co-authored-by: Olivier Grisel --- .../tests/test_pairwise_distances_reduction.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 7302e3369096d..df66695c95011 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -787,13 +787,15 @@ def test_n_threads_agnosticism( ) -@pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) -@pytest.mark.parametrize( - "Dispatcher", - [ArgKmin, RadiusNeighbors], +pytest.mark.parametrize( + "n_samples, chunk_size, Dispatcher, dtype", + [ + (100, 50, ArgKmin, np.float64), + (1024, 256, RadiusNeighbors, np.float32), + (100, 1024, ArgKmin, np.float32), + (541, 137, RadiusNeighbors, np.float64), + ], ) -@pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_format_agnosticism( global_random_seed, Dispatcher, From 58453d7ff9d5b0f839e4e18c5fae21c39108a764 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Sep 2022 14:12:56 +0200 Subject: [PATCH 63/68] fixup! TST Do not test on full cartesian product --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index df66695c95011..b6f0b5c0b1b1d 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -787,7 +787,7 @@ def test_n_threads_agnosticism( ) -pytest.mark.parametrize( +@pytest.mark.parametrize( "n_samples, chunk_size, Dispatcher, dtype", [ (100, 50, ArgKmin, np.float64), @@ -798,9 +798,9 @@ def test_n_threads_agnosticism( ) def test_format_agnosticism( global_random_seed, - Dispatcher, n_samples, chunk_size, + Dispatcher, dtype, n_features=100, ): From 63fda8c8b3c41d5a84cac6dec829370ace5212e4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Sep 2022 14:25:11 +0200 Subject: [PATCH 64/68] TST Add TODO for consistency checks on results for sparse and dense data Co-authored-by: Olivier Grisel --- sklearn/manifold/tests/test_isomap.py | 2 ++ sklearn/manifold/tests/test_t_sne.py | 2 ++ sklearn/neighbors/tests/test_lof.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index ccd4c2899d20a..a3db88c3c971f 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -219,6 +219,8 @@ def test_isomap_clone_bug(): @pytest.mark.parametrize("eigen_solver", eigen_solvers) @pytest.mark.parametrize("path_method", path_methods) def test_sparse_input(eigen_solver, path_method): + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 X = sparse_rand(100, 3, density=0.1, format="csr") clf = manifold.Isomap( diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 2df9e33b01a39..4b00c7b228969 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ 
b/sklearn/manifold/tests/test_t_sne.py @@ -330,6 +330,8 @@ def test_optimization_minimizes_kl_divergence(): @pytest.mark.parametrize("method", ["exact", "barnes_hut"]) def test_fit_transform_csr_matrix(method): + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 # X can be a sparse matrix. rng = check_random_state(0) X = rng.randn(50, 2) diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py index 5b897ee6b50d2..ae636ee5d64a1 100644 --- a/sklearn/neighbors/tests/test_lof.py +++ b/sklearn/neighbors/tests/test_lof.py @@ -232,6 +232,8 @@ def test_predicted_outlier_number(expected_outliers): def test_sparse(): # LocalOutlierFactor must support CSR inputs + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 X = csr_matrix(iris.data) lof = neighbors.LocalOutlierFactor(novelty=True) From 1d7bcc7cf89d87e30cb88cc648124d43bd80e26f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Sep 2022 15:21:02 +0200 Subject: [PATCH 65/68] MAINT Mark PairwiseDistancesReductions as unusable for some config. --- .../_dispatcher.py | 20 ++++++++++++++++++- .../test_pairwise_distances_reduction.py | 15 +++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 7b4cc94306494..00f6936e5a982 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -110,7 +110,7 @@ def is_valid_sparse_matrix(X): X.indices.dtype == X.indptr.dtype == np.int32 ) - return ( + is_usable = ( get_config().get("enable_cython_pairwise_dist", True) and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X)) and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) @@ -119,6 +119,24 @@ def is_valid_sparse_matrix(X): and metric in cls.valid_metrics() ) + # The other joblib-based back-end might be more efficient on fused sparse-dense + # datasets' pairs on metric="(sq)euclidean" for some configurations because it + # uses the Squared Euclidean matrix decomposition, i.e.: + # + # ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # + # calling efficient sparse-dense routines for matrix and vectors multiplication + # implemented in SciPy we do not use yet here. + # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa + # TODO: implement specialisation for (sq)euclidean on fused sparse-dense + # using sparse-dense routines for matrix-vector multiplications. 
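The decomposition referenced in the comment above is the usual expansion of the squared Euclidean distance; for a sparse `X` and dense `Y` it reduces to one sparse-dense matrix product plus row norms, which SciPy handles efficiently. A small illustration of why that back-end can win on such pairs (a sketch with illustrative data, not the dispatcher's code):

    import numpy as np
    from numpy.testing import assert_allclose
    from scipy.sparse import random as sparse_random
    from sklearn.metrics.pairwise import euclidean_distances

    rng = np.random.RandomState(0)
    X = sparse_random(20, 8, density=0.3, format="csr", random_state=rng)
    Y = rng.rand(15, 8)

    # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2; the cross term is a
    # sparse-dense matrix product.
    X_sq = np.asarray(X.multiply(X).sum(axis=1)).ravel()
    Y_sq = (Y ** 2).sum(axis=1)
    D_sq = X_sq[:, None] - 2 * np.asarray(X @ Y.T) + Y_sq[None, :]

    assert_allclose(D_sq, euclidean_distances(X, Y, squared=True), rtol=1e-7)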
+ fused_sparse_dense_euclidean_case_guard = not ( + (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y)) + and "euclidean" in metric + ) + + return is_usable and fused_sparse_dense_euclidean_case_guard + @classmethod @abstractmethod def compute( diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index b6f0b5c0b1b1d..a2a03530e2445 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -518,7 +518,7 @@ def test_pairwise_distances_reduction_is_usable_for(): Y = rng.rand(100, 10) X_csr = csr_matrix(X) Y_csr = csr_matrix(Y) - metric = "euclidean" + metric = "manhattan" # Must be usable for all possible pair of {dense, sparse} datasets assert BaseDistanceReductionDispatcher.is_usable_for(X, Y, metric) @@ -551,6 +551,19 @@ def test_pairwise_distances_reduction_is_usable_for(): np.asfortranarray(X), Y, metric ) + # We prefer not to use those implementations for fused sparse-dense when + # metric="(sq)euclidean" because it's not yet the most efficient one on + # all configurations of datasets. + # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa + # TODO: implement specialisation for (sq)euclidean on fused sparse-dense + # using sparse-dense routines for matrix-vector multiplications. + assert not BaseDistanceReductionDispatcher.is_usable_for( + X_csr, Y, metric="euclidean" + ) + assert not BaseDistanceReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="sqeuclidean" + ) + # CSR matrices without non-zeros elements aren't currently supported # TODO: support CSR matrices without non-zeros elements X_csr_0_nnz = csr_matrix(X * 0) From fec55bf54418c85ffc7b4ea742994b3e57964e09 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 15 Sep 2022 16:27:48 +0200 Subject: [PATCH 66/68] fixup! MAINT Mark PairwiseDistancesReductions as unusable for some config. --- sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 00f6936e5a982..809d683b52ced 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -132,6 +132,7 @@ def is_valid_sparse_matrix(X): # using sparse-dense routines for matrix-vector multiplications. fused_sparse_dense_euclidean_case_guard = not ( (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y)) + and isinstance(metric, str) and "euclidean" in metric ) From d55bcec61594c366b9649462d701d81eea9b3a74 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Sep 2022 09:56:54 +0200 Subject: [PATCH 67/68] TST Improve test_format_agnosticism MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …and add a space somewhere for proper formatting. 
Co-authored-by: Meekail Zain --- .../_datasets_pair.pyx.tp | 2 +- .../test_pairwise_distances_reduction.py | 73 ++++++------------- 2 files changed, 24 insertions(+), 51 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 2705952c73d87..cfa37a004f17a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -231,7 +231,7 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): @final cdef ITYPE_t n_samples_Y(self) nogil: - return self.Y_indptr.shape[0] -1 + return self.Y_indptr.shape[0] - 1 @final cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index a2a03530e2445..49768b4e80364 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,3 +1,4 @@ +import itertools import re from collections import defaultdict @@ -815,11 +816,12 @@ def test_format_agnosticism( chunk_size, Dispatcher, dtype, - n_features=100, ): # Results must not depend on the number of threads rng = np.random.RandomState(global_random_seed) spread = 100 + n_features = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread Y = rng.rand(n_samples, n_features).astype(dtype) * spread @@ -837,62 +839,33 @@ def test_format_agnosticism( check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} - # XXX: use itertools.pairwise when available? - dist_dense_dense, indices_dense_dense = Dispatcher.compute( - X, - Y, - parameter, - return_distance=True, - **compute_parameters, - ) - - dist_sparse_sparse, indices_sparse_sparse = Dispatcher.compute( - X_csr, - Y_csr, - parameter, - return_distance=True, - **compute_parameters, - ) - - ASSERT_RESULT[(Dispatcher, dtype)]( - dist_dense_dense, - dist_sparse_sparse, - indices_dense_dense, - indices_sparse_sparse, - **check_parameters, - ) - - dist_dense_sparse, indices_dense_sparse = Dispatcher.compute( + dist_dense, indices_dense = Dispatcher.compute( X, - Y_csr, - parameter, - return_distance=True, - **compute_parameters, - ) - - ASSERT_RESULT[(Dispatcher, dtype)]( - dist_dense_dense, - dist_dense_sparse, - indices_dense_dense, - indices_dense_sparse, - **check_parameters, - ) - - dist_sparse_dense, indices_sparse_dense = Dispatcher.compute( - X_csr, Y, parameter, + chunk_size=chunk_size, return_distance=True, **compute_parameters, ) - ASSERT_RESULT[(Dispatcher, dtype)]( - dist_dense_dense, - dist_sparse_dense, - indices_dense_dense, - indices_sparse_dense, - **check_parameters, - ) + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): + if _X is X and _Y is Y: + continue + dist, indices = Dispatcher.compute( + _X, + _Y, + parameter, + chunk_size=chunk_size, + return_distance=True, + **compute_parameters, + ) + ASSERT_RESULT[(Dispatcher, dtype)]( + dist_dense, + dist, + indices_dense, + indices, + **check_parameters, + ) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed From c21187a1fe7e494ab9c3cd9317134c4e4777044c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Sep 2022 10:01:38 +0200 Subject: [PATCH 68/68] DOC Update comment regarding the use of pairwise_distances_chunked See: `BaseDistanceReductionDispatcher.valid_metrics` Co-authored-by: Meekail Zain 
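A brief note on the `self.Y_indptr.shape[0] - 1` expression whose spacing is fixed above: for a CSR matrix, `indptr` stores one offset per row plus a trailing sentinel, so the number of samples is always `len(indptr) - 1`. A quick standalone check, illustrative only:

    import numpy as np
    from scipy.sparse import csr_matrix

    Y = csr_matrix(np.arange(12, dtype=np.float64).reshape(4, 3))
    # indptr[i]:indptr[i + 1] delimits the stored values of row i,
    # hence indptr has n_rows + 1 entries.
    assert Y.indptr.shape[0] - 1 == Y.shape[0] == 4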
--- sklearn/metrics/pairwise.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index e1b6afd36bd3b..96c3a216497ab 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -688,8 +688,9 @@ def pairwise_distances_argmin_min( values = values.flatten() indices = indices.flatten() else: - # TODO: once ArgKmin supports sparse input matrices and 32 bit, - # we won't need to fallback to pairwise_distances_chunked anymore. + # TODO: once BaseDistanceReductionDispatcher supports distance metrics + # for boolean datasets, we won't need to fallback to + # pairwise_distances_chunked anymore. # Turn off check for finiteness because this is costly and because arrays # have already been validated. @@ -799,8 +800,9 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs ) indices = indices.flatten() else: - # TODO: once ArgKmin supports sparse input matrices and 32 bit, - # we won't need to fallback to pairwise_distances_chunked anymore. + # TODO: once BaseDistanceReductionDispatcher supports distance metrics + # for boolean datasets, we won't need to fallback to + # pairwise_distances_chunked anymore. # Turn off check for finiteness because this is costly and because arrays # have already been validated.
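For context on the TODO comments updated above: when the dispatcher reports it is not usable (for example with a boolean metric such as "jaccard"), these functions fall back to chunked computation. A rough sketch of that fallback pattern using only the public `pairwise_distances_chunked` API; the helper and its reduce function are illustrative and not the private reducers used internally:

    import numpy as np
    from sklearn.metrics import pairwise_distances_chunked

    def argmin_min_fallback(X, Y, metric="jaccard"):
        # Reduce each chunk of the distance matrix to per-row (argmin, min).
        def _reduce(dist_chunk, start):
            indices = dist_chunk.argmin(axis=1)
            values = dist_chunk[np.arange(dist_chunk.shape[0]), indices]
            return indices, values

        chunks = pairwise_distances_chunked(
            X, Y, reduce_func=_reduce, metric=metric
        )
        all_indices, all_values = zip(*chunks)
        return np.concatenate(all_indices), np.concatenate(all_values)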