From b8bd8757e483541f38ca29837ea033194014356e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sat, 11 Jun 2022 09:37:24 +0200 Subject: [PATCH 01/28] MAINT Implement CSR support for all DistanceMetric --- sklearn/metrics/_dist_metrics.pxd.tp | 61 +- sklearn/metrics/_dist_metrics.pyx.tp | 1277 +++++++++++++++++++- sklearn/metrics/tests/test_dist_metrics.py | 70 +- 3 files changed, 1331 insertions(+), 77 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 32ba546672c6e..2ba4545dc02fc 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -79,8 +79,8 @@ cdef class DistanceMetric{{name_suffix}}: # Because we don't expect to instantiate a lot of these objects, the # extra memory overhead of this setup should not be an issue. cdef {{DTYPE_t}} p - cdef {{DTYPE_t}}[::1] vec - cdef {{DTYPE_t}}[:, ::1] mat + cdef DTYPE_t[::1] vec + cdef DTYPE_t[:, ::1] mat cdef ITYPE_t size cdef object func cdef object kwargs @@ -91,10 +91,59 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1 - cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1 - - cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y, - {{DTYPE_t}}[:, ::1] D) except -1 + cdef DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1 + + cdef DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const 
ITYPE_t size, + ) nogil except -1 + + cdef int pdist(self, + const {{DTYPE_t}}[:, ::1] X, + DTYPE_t[:, ::1] D, + ) except -1 + + cdef int cdist(self, + const {{DTYPE_t}}[:, ::1] X, + const {{DTYPE_t}}[:, ::1] Y, + DTYPE_t[:, ::1] D, + ) except -1 + + cdef int csr_pdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1 + + cdef int csr_cdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t[:] x2_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1 cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1 diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 5986fa939b45d..51108a18500ca 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -250,8 +250,8 @@ cdef class DistanceMetric{{name_suffix}}: """ def __cinit__(self): self.p = 2 - self.vec = np.zeros(1, dtype={{DTYPE}}, order='C') - self.mat = np.zeros((1, 1), dtype={{DTYPE}}, order='C') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -356,8 +356,11 @@ cdef class DistanceMetric{{name_suffix}}: """ return self.dist(x1, x2, size) - cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1: - """compute the pairwise distances between points in X""" + cdef int pdist(self, + const {{DTYPE_t}}[:, ::1] X, + DTYPE_t[:, ::1] D, + ) except -1: + """Compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 for i1 in range(X.shape[0]): for i2 in range(i1, X.shape[0]): @@ -365,9 +368,13 @@ cdef class DistanceMetric{{name_suffix}}: D[i2, i1] = D[i1, i2] return 0 - cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const 
{{DTYPE_t}}[:, ::1] Y, - {{DTYPE_t}}[:, ::1] D) except -1: - """compute the cross-pairwise distances between arrays X and Y""" + + cdef int cdist(self, + const {{DTYPE_t}}[:, ::1] X, + const {{DTYPE_t}}[:, ::1] Y, + DTYPE_t[:, ::1] D, + ) except -1: + """Compute the cross-pairwise distances between arrays X and Y""" cdef ITYPE_t i1, i2 if X.shape[1] != Y.shape[1]: raise ValueError('X and Y must have the same second dimension') @@ -376,6 +383,136 @@ cdef class DistanceMetric{{name_suffix}}: D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 + cdef DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + """Compute the distance between vectors x1 and x2 represented + under the CSR format. + + This must be overridden in a base class. + + Note that we pass all the parameter as to not use memoryview slicing + because it is currently known to slow down execution as it + takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 + """ + return -999 + + cdef DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + """Compute the distance between vectors x1 and x2 represented + under the CSR format. + + This can optionally be overridden in a base class. + + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. 
+ + Note that we pass all the parameter as to not use memoryview slicing + because it is currently known to slow down execution as it + takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 + """ + return self.csr_dist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + + cdef int csr_pdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1: + """Compute the pairwise distances between points in X + represented in the CSR format.""" + cdef: + ITYPE_t i1, i2 + ITYPE_t n_x1 = x1_indptr.shape[0] - 1 + ITYPE_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(i1, n_x1): + x2_start = x1_indptr[i2] + x2_end = x1_indptr[i2 + 1] + D[i1, i2] = D[i2, i1] = self.csr_dist( + x1_data, + x1_indices, + x1_data, + x1_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + + cdef int csr_cdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const cnp.int32_t[:] x1_indptr, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t[:] x2_indptr, + const ITYPE_t size, + DTYPE_t[:, ::1] D, + ) nogil except -1: + """Compute the cross-pairwise distances between arrays X and Y + represented in the CSR format.""" + cdef: + ITYPE_t i1, i2 + ITYPE_t n_x1 = x1_indptr.shape[0] - 1 + ITYPE_t n_x2 = x2_indptr.shape[0] - 1 + ITYPE_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(n_x2): + x2_start = x2_indptr[i2] + x2_end = x2_indptr[i2 + 1] + + D[i1, i2] = self.csr_dist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except 
-1: """Convert the rank-preserving surrogate distance to the distance""" return rdist @@ -424,6 +561,61 @@ cdef class DistanceMetric{{name_suffix}}: """ return dist + def _pairwise_dense(self, X, Y=None): + cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr + cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr + cdef cnp.ndarray[DTYPE_t, ndim=2, mode='c'] Darr + + Xarr = np.asarray(X, dtype={{DTYPE}}, order='C') + self._validate_data(Xarr) + if Y is None: + Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), + dtype=DTYPE, order='C') + self.pdist(Xarr, Darr) + else: + Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C') + self._validate_data(Yarr) + Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), + dtype=DTYPE, order='C') + self.cdist(Xarr, Yarr, Darr) + return Darr + + def _pairwise_sparse(self, X, Y=None): + X_csr = X.tocsr() + n_X, size = X_csr.shape + X_data = np.asarray(X_csr.data, dtype={{DTYPE}}) + X_indices = np.asarray(X_csr.indices, dtype=np.int32) + X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) + + if Y is None: + Darr = np.zeros((n_X, n_X), dtype=DTYPE, order='C') + self.csr_pdist( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + size=size, + D=Darr, + ) + else: + Y_csr = Y.tocsr() + n_Y, _ = Y_csr.shape + Y_data = np.asarray(Y_csr.data, dtype={{DTYPE}}) + Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) + Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) + + Darr = np.zeros((n_X, n_Y), dtype=DTYPE, order='C') + self.csr_cdist( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=Y_data, + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=size, + D=Darr, + ) + return Darr + def pairwise(self, X, Y=None): """Compute the pairwise distances between X and Y @@ -445,23 +637,10 @@ cdef class DistanceMetric{{name_suffix}}: The shape (Nx, Ny) array of pairwise distances between points in X and Y. 
""" - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Darr - - Xarr = np.asarray(X, dtype={{DTYPE}}, order='C') - self._validate_data(Xarr) - if Y is None: - Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), - dtype={{DTYPE}}, order='C') - self.pdist(Xarr, Darr) - else: - Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C') - self._validate_data(Yarr) - Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), - dtype={{DTYPE}}, order='C') - self.cdist(Xarr, Yarr, Darr) - return Darr + if not issparse(X) and (Y is None or not issparse(Y)): + return self._pairwise_dense(X, Y) + if issparse(X) and (Y is None or issparse(Y)): + return self._pairwise_sparse(X, Y) #------------------------------------------------------------ @@ -496,6 +675,79 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + DTYPE_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = (x1_data[i1] - x2_data[i2]) + d = d + (unsquared * unsquared) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + else: + while i1 < x1_end: + unsquared = x1_data[i1] + d = d + 
(unsquared * unsquared) + i1 = i1 + 1 + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return sqrt(self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) #------------------------------------------------------------ # SEuclidean Distance @@ -507,7 +759,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } """ def __init__(self, V): - self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype={{DTYPE}})) + self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=DTYPE)) self.size = self.vec.shape[0] self.p = 2 @@ -540,6 +792,80 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + DTYPE_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = (x1_data[i1] - x2_data[i2]) + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] + i2 = i2 + 1 + + 
if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] # scale by the variance of the x2-only feature just read + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] # scale by the variance of the x1-only feature just read + i1 = i1 + 1 + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return sqrt(self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) #------------------------------------------------------------ # Manhattan Distance @@ -561,6 +887,51 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) return d + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + fabs(x1_data[i1] - x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + else: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + + return d + #------------------------------------------------------------ # Chebyshev Distance @@ -595,6 +966,52 @@ cdef class
ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + else: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + + return d + + #------------------------------------------------------------ # Minkowski Distance cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): @@ -632,14 +1049,14 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): self.p = p if w is not None: w_array = check_array( - w, ensure_2d=False, dtype={{DTYPE}}, input_name="w" + w, ensure_2d=False, dtype=DTYPE, input_name="w" ) if (w_array < 0).any(): raise ValueError("w cannot contain negative weights") self.vec = ReadonlyArrayWrapper(w_array) self.size = self.vec.shape[0] else: - self.vec = ReadonlyArrayWrapper(np.asarray([], dtype={{DTYPE}})) + self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE)) self.size = 0 def _validate_data(self, X): @@ -677,6 +1094,106 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + 
const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + bint has_w = self.size > 0 + + if has_w: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + (self.vec[x2_indices[i2]] * pow(fabs(x2_data[i2]), self.p)) # look up the weight of the current entry; ix2 is stale or unset here + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + (self.vec[x1_indices[i1]] * pow(fabs(x1_data[i1]), self.p)) # look up the weight of the current entry; ix1 is stale or unset here + i1 = i1 + 1 + + return d + else: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return pow( + self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end,
+ x2_start, + x2_end, + size, + ), + 1 / self.p + ) #------------------------------------------------------------ # TODO: Remove in 1.3 - WMinkowskiDistance class @@ -714,7 +1231,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("WMinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") self.p = p - self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype={{DTYPE}})) + self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE)) self.size = self.vec.shape[0] def _validate_data(self, X): @@ -747,6 +1264,78 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + pow(self.vec[ix1] * fabs(x1_data[i1] - x2_data[i2]), self.p) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + i1 = i1 + 1 + else: + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + i1 = i1 + 1 + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + 
const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return pow( + self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ), + 1 / self.p + ) #------------------------------------------------------------ # Mahalanobis Distance @@ -775,12 +1364,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: raise ValueError("V/VI must be square") - self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype={{DTYPE}}, order='C')) + self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=DTYPE, order='C')) self.size = self.mat.shape[0] # we need vec as a work buffer - self.vec = np.zeros(self.size, dtype={{DTYPE}}) + self.vec = np.zeros(self.size, dtype=DTYPE) def _validate_data(self, X): if X.shape[1] != self.size: @@ -818,6 +1407,81 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 + cdef inline DTYPE_t csr_rdist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + DTYPE_t tmp, d = 0.0 + + for i in range(size): # clear the shared work buffer: only features stored in x1 or x2 are written below + self.vec[i] = 0.0 + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + self.vec[ix1] = x1_data[i1] - x2_data[i2] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + self.vec[ix1] = x1_data[i1] + i1 = i1 + 1 + else: + self.vec[ix2] = - x2_data[i2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + self.vec[ix2] = - x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + self.vec[ix1] =
x1_data[i1] + i1 = i1 + 1 + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat[i, j] * self.vec[j] + d += tmp * self.vec[i] + + return d + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return sqrt(self.csr_rdist( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) #------------------------------------------------------------ # Hamming Distance @@ -841,6 +1505,54 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += (x1_data[i1] != x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += 1 + i1 = i1 + 1 + else: + d += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + d += 1 + i1 = i1 + 1 + + d /= size + + return d + + #------------------------------------------------------------ # Canberra Distance # D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] @@ -863,6 +1575,50 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (fabs(x1[j] - x2[j])) / denom return d + cdef inline DTYPE_t csr_dist(self, + const
{{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += fabs(x1_data[i1] - x2_data[i2]) / (fabs(x1_data[i1]) + fabs(x2_data[i2])) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += 1. + i1 = i1 + 1 + else: + d += 1. + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += 1. + i2 = i2 + 1 + else: + while i1 < x1_end: + d += 1. + i1 = i1 + 1 + + return d #------------------------------------------------------------ # Bray-Curtis Distance @@ -888,6 +1644,56 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + DTYPE_t num = 0.0 + DTYPE_t denom = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + num += fabs(x1_data[i1] - x2_data[i2]) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i1 = i1 + 1 + else: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + num += fabs(x2_data[i2]) # x1 is exhausted: only x2 entries remain + denom += fabs(x2_data[i2]) + i2 = i2 + 1 + else: + while
i1 < x1_end: + num += fabs(x1_data[i1]) # x2 is exhausted: only x1 entries remain + denom += fabs(x1_data[i1]) + i1 = i1 + 1 + + return num / denom if denom > 0 else 0.0 # no non-zero entry in either row: return 0.0 instead of dividing by zero #------------------------------------------------------------ # Jaccard Distance (boolean) @@ -903,20 +1709,73 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_eq = 0, nnz = 0 + cdef int tf1, tf2, n_tt = 0, nnz = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 nnz += (tf1 or tf2) - n_eq += (tf1 and tf2) + n_tt += (tf1 and tf2) # Based on https://github.com/scipy/scipy/pull/7373 # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric # was changed to return 0, instead of nan. if nnz == 0: return 0 - return (nnz - n_eq) * 1.0 / nnz + return (nnz - n_tt) * 1.0 / nnz + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + nnz += (tf1 or tf2) + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + nnz += tf1 + i1 = i1 + 1 + else: + nnz += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + nnz += (x2_data[i2] != 0) # test the current entry; tf2 is stale or unset here + i2 = i2 + 1 + else: + while i1 < x1_end: + nnz += (x1_data[i1] != 0) # test the current entry; tf1 is stale or unset here + i1 = i1 + 1 + # Based on https://github.com/scipy/scipy/pull/7373 + # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric + # was changed to return 0, instead of nan.
+ if nnz == 0: + return 0 + return (nnz - n_tt) * 1.0 / nnz #------------------------------------------------------------ # Matching Distance (boolean) @@ -940,6 +1799,52 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. / size + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return n_neq * 1.0 / size #------------------------------------------------------------ # Dice Distance (boolean) @@ -956,14 +1861,63 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0, ntt = 0 + cdef int tf1, tf2, n_neq = 0, n_tt = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 - ntt += (tf1 and tf2) + n_tt += (tf1 and tf2) n_neq += (tf1 != tf2) - return n_neq / (2.0 * ntt + n_neq) + return n_neq / (2.0 * n_tt + n_neq) + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const 
cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return n_neq / (2.0 * n_tt + n_neq) #------------------------------------------------------------ @@ -981,15 +1935,63 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return (n_neq - ntt + size) * 1.0 / (n_neq + size) + n_tt += (tf1 and tf2) + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 
!= tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) #------------------------------------------------------------ # Rogers-Tanimoto Distance (boolean) @@ -1013,6 +2015,53 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) #------------------------------------------------------------ # Russell-Rao Distance (boolean) @@ -1028,13 +2077,55 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0 + cdef int tf1, tf2, n_tt = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 - ntt += (tf1 and tf2) - return (size - ntt) * 1. 
/ size + n_tt += (tf1 and tf2) + return (size - n_tt) * 1. / size + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + i1 = i1 + 1 + else: + i2 = i2 + 1 + + # We don't need to go through all the longest + # vector because tf1 or tf2 will be false + # and thus n_tt won't be increased. + + return (size - n_tt) * 1. / size + #------------------------------------------------------------ @@ -1059,6 +2150,53 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else:
+ while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) #------------------------------------------------------------ # Sokal-Sneath Distance (boolean) @@ -1074,14 +2212,63 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return n_neq / (0.5 * ntt + n_neq) + n_tt += (tf1 and tf2) + return n_neq / (0.5 * n_tt + n_neq) + + cdef inline DTYPE_t csr_dist(self, + const {{DTYPE_t}}[:] x1_data, + const cnp.int32_t[:] x1_indices, + const {{DTYPE_t}}[:] x2_data, + const cnp.int32_t[:] x2_indices, + const cnp.int32_t x1_start, + const cnp.int32_t x1_end, + const cnp.int32_t x2_start, + const cnp.int32_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + + ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += 1 + i1 = i1 + 1 + else: + n_neq += 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += 1 + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += 1 + i1 = i1 + 1 + + return n_neq / (0.5 * n_tt + n_neq) #------------------------------------------------------------ diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 4cc8b945ffdab..8f1ddd662ca3f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -83,14 +83,15 @@ def test_cdist(metric_param_grid, 
X, Y): ) metric, param_grid = metric_param_grid keys = param_grid.keys() + X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y) for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) + rtol_dict = {} if metric == "mahalanobis": - # See: https://github.com/scipy/scipy/issues/13861 - # Possibly caused by: https://github.com/joblib/joblib/issues/563 - pytest.xfail( - "scipy#13861: cdist with 'mahalanobis' fails on joblib memmap data" - ) + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. + rtol_dict = {"rtol": 1e-6} if metric == "wminkowski": # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 @@ -104,7 +105,10 @@ def test_cdist(metric_param_grid, X, Y): dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X, Y) - assert_allclose(D_sklearn, D_scipy_cdist) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr, Y_csr) + assert_allclose(D_sklearn_csr, D_scipy_cdist, **rtol_dict) @pytest.mark.parametrize("metric", BOOL_METRICS) @@ -112,28 +116,38 @@ def test_cdist(metric_param_grid, X, Y): "X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)] ) def test_cdist_bool_metric(metric, X_bool, Y_bool): - D_true = cdist(X_bool, Y_bool, metric) + D_scipy_cdist = cdist(X_bool, Y_bool, metric) + dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(X_bool, Y_bool) - assert_allclose(D12, D_true) + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) + + X_bool_csr, Y_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) + D_sklearn_csr = dm.pairwise(X_bool_csr, Y_csr) + assert_allclose(D_sklearn_csr, D_scipy_cdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) 
-@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) -def test_pdist(metric_param_grid, X, Y): +@pytest.mark.parametrize("X", [X64, X32, X_mmap]) +def test_pdist(metric_param_grid, X): DistanceMetricInterface = ( - DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32 + DistanceMetric if X.dtype == np.float64 else DistanceMetric32 ) metric, param_grid = metric_param_grid keys = param_grid.keys() + X_csr = sp.csr_matrix(X) for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) + rtol_dict = {} if metric == "mahalanobis": - # See: https://github.com/scipy/scipy/issues/13861 - pytest.xfail("scipy#13861: pdist with 'mahalanobis' fails onmemmap data") - elif metric == "wminkowski": + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. + rtol_dict = {"rtol": 1e-6} + + if metric == "wminkowski": if sp_version >= parse_version("1.8.0"): pytest.skip("wminkowski will be removed in SciPy 1.8.0") @@ -142,13 +156,16 @@ def test_pdist(metric_param_grid, X, Y): if sp_version >= parse_version("1.6.0"): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - D_true = cdist(X, X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) else: - D_true = cdist(X, X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) - D12 = dm.pairwise(X) - assert_allclose(D12, D_true) + D_sklearn = dm.pairwise(X) + assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr) + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @@ -166,25 +183,26 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - assert_allclose(D64, D32) + assert_allclose(D64, D32, rtol=1e-5) D64 = 
dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32) + assert_allclose(D64, D32, rtol=1e-5) @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): - D_true = cdist(X_bool, X_bool, metric) - dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(X_bool) + D_scipy_pdist = cdist(X_bool, X_bool, metric) # Based on https://github.com/scipy/scipy/pull/7373 # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric # was changed to return 0, instead of nan. if metric == "jaccard" and sp_version < parse_version("1.2.0"): - D_true[np.isnan(D_true)] = 0 - assert_allclose(D12, D_true) + D_scipy_pdist[np.isnan(D_scipy_pdist)] = 0 + + dm = DistanceMetric.get_metric(metric) + D_sklearn = dm.pairwise(X_bool) + assert_allclose(D_sklearn, D_scipy_pdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed From fb9968017ec55c050b974bf62decc83454f75314 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Jun 2022 09:50:18 +0200 Subject: [PATCH 02/28] TST Remove useless guard --- sklearn/metrics/tests/test_dist_metrics.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 8f1ddd662ca3f..e1273a900d90f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -194,12 +194,6 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): D_scipy_pdist = cdist(X_bool, X_bool, metric) - # Based on https://github.com/scipy/scipy/pull/7373 - # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric - # was changed to return 0, instead of nan. 
- if metric == "jaccard" and sp_version < parse_version("1.2.0"): - D_scipy_pdist[np.isnan(D_scipy_pdist)] = 0 - dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) From d39d2b2486b39a7cefe07a2ea0caeab329614edc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Jun 2022 10:57:52 +0200 Subject: [PATCH 03/28] TST Skip JaccardDistance on 32bit architecture --- sklearn/metrics/_dist_metrics.pyx.tp | 6 +++--- sklearn/metrics/tests/test_dist_metrics.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 51108a18500ca..154c242eb66b2 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -1709,19 +1709,19 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): """ cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_tt = 0, nnz = 0 + cdef int tf1, tf2, n_eq = 0, nnz = 0 cdef cnp.intp_t j for j in range(size): tf1 = x1[j] != 0 tf2 = x2[j] != 0 nnz += (tf1 or tf2) - n_tt += (tf1 and tf2) + n_eq += (tf1 and tf2) # Based on https://github.com/scipy/scipy/pull/7373 # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric # was changed to return 0, instead of nan. 
if nnz == 0: return 0 - return (nnz - n_tt) * 1.0 / nnz + return (nnz - n_eq) * 1.0 / nnz cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index e1273a900d90f..d939dd431a01a 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -17,7 +17,7 @@ DistanceMetric32, ) -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, _IS_32BIT from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -118,6 +118,9 @@ def test_cdist(metric_param_grid, X, Y): def test_cdist_bool_metric(metric, X_bool, Y_bool): D_scipy_cdist = cdist(X_bool, Y_bool, metric) + if metric == "jaccard" and _IS_32BIT: + pytest.skip("Jaccard Distance on 32bit architecture is unstable.") + dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool, Y_bool) assert_allclose(D_sklearn, D_scipy_cdist) From 011e2a2ad072908215a827b59058745dc8ce2c1f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 16:49:38 +0200 Subject: [PATCH 04/28] MAINT Define dtype alias for sparse matrices indices --- sklearn/metrics/_dist_metrics.pxd.tp | 38 ++-- sklearn/metrics/_dist_metrics.pyx.tp | 312 +++++++++++++-------------- sklearn/utils/_typedefs.pxd | 10 + sklearn/utils/_typedefs.pyx | 3 + 4 files changed, 188 insertions(+), 175 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 2ba4545dc02fc..ba257e89f02d4 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -28,7 +28,7 @@ implementation_specific_values = [ cimport numpy as cnp from libc.math cimport sqrt, exp -from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t {{for name_suffix, DTYPE_t, DTYPE in 
implementation_specific_values}} @@ -93,25 +93,25 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1 cdef DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1 @@ -128,19 +128,19 @@ cdef class DistanceMetric{{name_suffix}}: cdef int csr_pdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1 cdef int csr_cdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t[:] x2_indptr, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1 diff --git 
a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 154c242eb66b2..d0a1328fc3e1e 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -385,13 +385,13 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: """Compute the distance between vectors x1 and x2 represented @@ -407,13 +407,13 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: """Compute the distance between vectors x1 and x2 represented @@ -444,8 +444,8 @@ cdef class DistanceMetric{{name_suffix}}: cdef int csr_pdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1: @@ -477,11 +477,11 @@ cdef class DistanceMetric{{name_suffix}}: cdef int csr_cdist(self, const {{DTYPE_t}}[:] 
x1_data, - const cnp.int32_t[:] x1_indices, - const cnp.int32_t[:] x1_indptr, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indptr, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t[:] x2_indptr, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1: @@ -677,13 +677,13 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -728,13 +728,13 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return sqrt(self.csr_rdist( @@ -794,13 +794,13 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const 
{{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -846,13 +846,13 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return sqrt(self.csr_rdist( @@ -889,13 +889,13 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -968,13 +968,13 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] 
x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1096,13 +1096,13 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1171,13 +1171,13 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return pow( @@ -1266,13 +1266,13 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const 
cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1313,13 +1313,13 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return pow( @@ -1409,13 +1409,13 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_rdist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1462,13 +1462,13 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] 
x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: return sqrt(self.csr_rdist( @@ -1507,13 +1507,13 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1577,13 +1577,13 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1646,13 +1646,13 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, 
- const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1725,13 +1725,13 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1801,13 +1801,13 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1872,13 +1872,13 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const 
cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -1946,13 +1946,13 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2017,13 +2017,13 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2087,13 +2087,13 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const 
cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2152,13 +2152,13 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: @@ -2223,13 +2223,13 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, const {{DTYPE_t}}[:] x1_data, - const cnp.int32_t[:] x1_indices, + const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{DTYPE_t}}[:] x2_data, - const cnp.int32_t[:] x2_indices, - const cnp.int32_t x1_start, - const cnp.int32_t x1_end, - const cnp.int32_t x2_start, - const cnp.int32_t x2_end, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index ee0c8ca3b57e9..9c2db8cf32c4a 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -15,3 +15,13 @@ cdef enum: ctypedef cnp.intp_t ITYPE_t # WARNING: should match ITYPE in typedefs.pyx ctypedef cnp.int32_t INT32TYPE_t # WARNING: should match INT32TYPE in typedefs.pyx ctypedef cnp.int64_t INT64TYPE_t # 
WARNING: should match INT64TYPE in typedefs.pyx + +# scipy matrices indices dtype (namely for indptr and indices arrays) +# +# Note that indices might need to be represented as cnp.int64_t. +# Currently, we use Cython classes which do not handle fused types +# so we hardcode this type to cnp.int32_t, supporting all but edge +# cases. +# +# TODO: support cnp.int64_t for this case +ctypedef cnp.int32_t SPARSE_INDEX_TYPE_t diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx index 09e5a6a44944a..839aa4e5fde83 100644 --- a/sklearn/utils/_typedefs.pyx +++ b/sklearn/utils/_typedefs.pyx @@ -19,6 +19,9 @@ INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd #DTYPE = np.asarray(ddummy_view).dtype DTYPE = np.float64 # WARNING: this should match DTYPE_t in typedefs.pxd +# WARNING: this must match SPARSE_INDEX_TYPE_t in typedefs.pxd +SPARSE_INDEX_TYPE = np.int32 + # some handy constants cdef DTYPE_t INF = np.inf cdef DTYPE_t PI = np.pi From a579630f99a9c13cdace51e86fc281625bd37368 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 17:16:51 +0200 Subject: [PATCH 05/28] MAINT Do not shadow dtype names in Tempita templating --- sklearn/metrics/_dist_metrics.pxd.tp | 56 ++-- sklearn/metrics/_dist_metrics.pyx.tp | 373 +++++++++++++++++---------- 2 files changed, 261 insertions(+), 168 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index ba257e89f02d4..af3b84cac9d4a 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -3,7 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, DTYPE_t, DTYPE + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE # # On the first hand, an empty string is used for `name_suffix` # for the float64 case as to still be able to expose the original @@ -30,7 +30,7 @@ from libc.math cimport sqrt, exp from ..utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t 
-{{for name_suffix, DTYPE_t, DTYPE in implementation_specific_values}} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### # Inline distance functions @@ -38,8 +38,8 @@ from ..utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t # We use these for the default (euclidean) case so that they can be # inlined. This leads to faster computation for the most common case cdef inline DTYPE_t euclidean_dist{{name_suffix}}( - const {{DTYPE_t}}* x1, - const {{DTYPE_t}}* x2, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: cdef DTYPE_t tmp, d=0 @@ -51,8 +51,8 @@ cdef inline DTYPE_t euclidean_dist{{name_suffix}}( cdef inline DTYPE_t euclidean_rdist{{name_suffix}}( - const {{DTYPE_t}}* x1, - const {{DTYPE_t}}* x2, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: cdef DTYPE_t tmp, d=0 @@ -63,11 +63,11 @@ cdef inline DTYPE_t euclidean_rdist{{name_suffix}}( return d -cdef inline DTYPE_t euclidean_dist_to_rdist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1: +cdef inline DTYPE_t euclidean_dist_to_rdist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist -cdef inline DTYPE_t euclidean_rdist_to_dist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1: +cdef inline DTYPE_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) nogil except -1: return sqrt(dist) @@ -78,23 +78,29 @@ cdef class DistanceMetric{{name_suffix}}: # we must define them here so that cython's limited polymorphism will work. # Because we don't expect to instantiate a lot of these objects, the # extra memory overhead of this setup should not be an issue. 
- cdef {{DTYPE_t}} p + cdef {{INPUT_DTYPE_t}} p cdef DTYPE_t[::1] vec cdef DTYPE_t[:, ::1] mat cdef ITYPE_t size cdef object func cdef object kwargs - cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1 + cdef DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1 - cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1 + cdef DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1 cdef DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -104,9 +110,9 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1 cdef DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -116,18 +122,18 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1 cdef int pdist(self, - const {{DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1 cdef int cdist(self, - const {{DTYPE_t}}[:, ::1] X, - const {{DTYPE_t}}[:, ::1] Y, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, ) except -1 cdef int csr_pdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, @@ -135,19 +141,19 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1 
cdef int csr_cdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1 - cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1 + cdef DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1 - cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1 + cdef DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1 {{endfor}} diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index d0a1328fc3e1e..d85a369b0cb39 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -3,7 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, DTYPE_t, DTYPE + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE # # # On the first hand, an empty string is used for `name_suffix` @@ -87,7 +87,7 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] -{{for name_suffix, DTYPE_t, DTYPE in implementation_specific_values}} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### # metric mappings @@ -120,7 +120,7 @@ METRIC_MAPPING{{name_suffix}} = { 'pyfunc': PyFuncDistance{{name_suffix}}, } -cdef inline cnp.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, cnp.npy_intp n): +cdef inline cnp.ndarray _buffer_to_ndarray{{name_suffix}}(const {{INPUT_DTYPE_t}}* x, cnp.npy_intp n): # Wrap a memory buffer with an ndarray. Warning: this is not robust. # In particular, if x is deallocated before the returned array goes # out of scope, this could cause memory errors. 
Since there is not @@ -130,7 +130,7 @@ cdef inline cnp.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -cdef {{DTYPE_t}} INF{{name_suffix}} = np.inf +cdef {{INPUT_DTYPE_t}} INF{{name_suffix}} = np.inf ###################################################################### @@ -335,16 +335,22 @@ cdef class DistanceMetric{{name_suffix}}: """ return - cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: """Compute the distance between vectors x1 and x2 This should be overridden in a base class. """ return -999 - cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: """Compute the rank-preserving surrogate distance between vectors x1 and x2. This can optionally be overridden in a base class. 
@@ -357,7 +363,7 @@ cdef class DistanceMetric{{name_suffix}}: return self.dist(x1, x2, size) cdef int pdist(self, - const {{DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1: """Compute the pairwise distances between points in X""" @@ -370,8 +376,8 @@ cdef class DistanceMetric{{name_suffix}}: cdef int cdist(self, - const {{DTYPE_t}}[:, ::1] X, - const {{DTYPE_t}}[:, ::1] Y, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, ) except -1: """Compute the cross-pairwise distances between arrays X and Y""" @@ -384,9 +390,9 @@ cdef class DistanceMetric{{name_suffix}}: return 0 cdef DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -406,9 +412,9 @@ cdef class DistanceMetric{{name_suffix}}: return -999 cdef DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -443,7 +449,7 @@ cdef class DistanceMetric{{name_suffix}}: ) cdef int csr_pdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, const ITYPE_t size, @@ -476,10 +482,10 @@ cdef class DistanceMetric{{name_suffix}}: return 0 cdef int csr_cdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const 
SPARSE_INDEX_TYPE_t[:] x2_indptr, const ITYPE_t size, @@ -513,11 +519,11 @@ cdef class DistanceMetric{{name_suffix}}: ) return 0 - cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: """Convert the rank-preserving surrogate distance to the distance""" return rdist - cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: """Convert the distance to the rank-preserving surrogate distance""" return dist @@ -562,18 +568,18 @@ cdef class DistanceMetric{{name_suffix}}: return dist def _pairwise_dense(self, X, Y=None): - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr - cdef cnp.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr + cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Xarr + cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Yarr cdef cnp.ndarray[DTYPE_t, ndim=2, mode='c'] Darr - Xarr = np.asarray(X, dtype={{DTYPE}}, order='C') + Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if Y is None: Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), dtype=DTYPE, order='C') self.pdist(Xarr, Darr) else: - Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C') + Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') @@ -583,7 +589,7 @@ cdef class DistanceMetric{{name_suffix}}: def _pairwise_sparse(self, X, Y=None): X_csr = X.tocsr() n_X, size = X_csr.shape - X_data = np.asarray(X_csr.data, dtype={{DTYPE}}) + X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X_csr.indices, dtype=np.int32) X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) @@ -599,7 +605,7 @@ cdef class DistanceMetric{{name_suffix}}: else: Y_csr = Y.tocsr() n_Y, _ = Y_csr.shape - Y_data = np.asarray(Y_csr.data, dtype={{DTYPE}}) + Y_data = np.asarray(Y_csr.data, 
dtype={{INPUT_DTYPE}}) Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) @@ -655,18 +661,24 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 2 - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return euclidean_dist{{name_suffix}}(x1, x2, size) - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return euclidean_rdist{{name_suffix}}(x1, x2, size) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return sqrt(rdist) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist def rdist_to_dist(self, rdist): @@ -676,9 +688,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** 2 cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -727,9 +739,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] 
x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -767,8 +779,11 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t tmp, d=0 cdef cnp.intp_t j for j in range(size): @@ -776,14 +791,17 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return sqrt(self.rdist(x1, x2, size)) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return sqrt(rdist) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist def rdist_to_dist(self, rdist): @@ -793,9 +811,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** 2 cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -817,37 +835,37 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2] if 
ix1 == ix2: - unsquared = (x1_data[i1] - x2_data[i2]) + unsquared = (x1_data[i1] - x2_data[i2]) d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - unsquared = x1_data[i1] + unsquared = x1_data[i1] d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 else: - unsquared = x2_data[i2] + unsquared = x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix2] i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2] - unsquared = x2_data[i2] + unsquared = x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix2] i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1] - unsquared = x1_data[i1] + unsquared = x1_data[i1] d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -879,8 +897,11 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): @@ -888,9 +909,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -957,8 +978,11 @@ cdef class 
ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): @@ -967,9 +991,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1065,8 +1089,11 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). " f"Currently len(w)={self.size}.") - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d=0 cdef cnp.intp_t j cdef bint has_w = self.size > 0 @@ -1078,14 +1105,17 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs(x1[j] - x2[j]), self.p)) return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return pow(rdist, 1. 
/ self.p) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1095,9 +1125,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** self.p cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1170,9 +1200,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1239,8 +1269,11 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError('WMinkowskiDistance dist: ' 'size of w does not match') - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t d = 0 cdef cnp.intp_t j @@ -1248,14 +1281,17 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)) return d - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return 
pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return pow(rdist, 1. / self.p) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1265,9 +1301,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** self.p cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1312,9 +1348,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1375,8 +1411,11 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t tmp, d = 0 cdef cnp.intp_t i, j @@ -1391,14 +1430,17 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.vec[i] return d - cdef inline DTYPE_t dist(self, const 
{{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return sqrt(self.rdist(x1, x2, size)) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return sqrt(rdist) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: return dist * dist def rdist_to_dist(self, rdist): @@ -1408,9 +1450,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return dist ** 2 cdef inline DTYPE_t csr_rdist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1461,9 +1503,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1495,8 +1537,11 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int n_unequal = 0 cdef cnp.intp_t j for j in range(size): @@ -1506,9 +1551,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1565,8 +1610,11 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t denom, d = 0 cdef cnp.intp_t j for j in range(size): @@ -1576,9 +1624,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1632,8 +1680,11 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t num = 0, denom = 0 cdef cnp.intp_t j for j in range(size): @@ -1645,9 +1696,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0.0 cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1707,8 +1758,11 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_eq = 0, nnz = 0 cdef cnp.intp_t j for j in range(size): @@ -1724,9 +1778,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (nnz - n_eq) * 1.0 / nnz cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1789,8 +1843,11 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef 
inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -1800,9 +1857,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return n_neq * 1. / size cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1859,8 +1916,11 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0, n_tt = 0 cdef cnp.intp_t j for j in range(size): @@ -1871,9 +1931,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return n_neq / (2.0 * n_tt + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -1933,8 +1993,11 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_tt = 0, n_neq = 0 
cdef cnp.intp_t j for j in range(size): @@ -1945,9 +2008,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (n_neq - n_tt + size) * 1.0 / (n_neq + size) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2005,8 +2068,11 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -2016,9 +2082,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (2.0 * n_neq) / (size + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2075,8 +2141,11 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_tt = 0 cdef cnp.intp_t j for j in range(size): @@ -2086,9 +2155,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 
(size - n_tt) * 1. / size cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2140,8 +2209,11 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -2151,9 +2223,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return (2.0 * n_neq) / (size + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2210,8 +2282,11 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef int tf1, tf2, n_tt = 0, n_neq = 0 cdef cnp.intp_t j for j in range(size): @@ -2222,9 +2297,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return n_neq / (0.5 * n_tt + n_neq) cdef inline DTYPE_t csr_dist(self, - const {{DTYPE_t}}[:] x1_data, + const {{INPUT_DTYPE_t}}[:] 
x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, - const {{DTYPE_t}}[:] x2_data, + const {{INPUT_DTYPE_t}}[:] x2_data, const SPARSE_INDEX_TYPE_t[:] x2_indices, const SPARSE_INDEX_TYPE_t x1_start, const SPARSE_INDEX_TYPE_t x1_end, @@ -2292,20 +2367,26 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return 2 * asin(sqrt(self.rdist(x1, x2, size))) - cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1: + cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: return 2 * asin(sqrt(rdist)) - cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1: + cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: cdef DTYPE_t tmp = sin(0.5 * dist) return tmp * tmp @@ -2338,12 +2419,18 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. 
- cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) nogil except -1: + cdef inline DTYPE_t dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) nogil except -1: return self._dist(x1, x2, size) - cdef inline DTYPE_t _dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, - ITYPE_t size) except -1 with gil: + cdef inline DTYPE_t _dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + ITYPE_t size, + ) except -1 with gil: cdef cnp.ndarray x1arr cdef cnp.ndarray x2arr x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size) From 98e9d21efc6d76f3a441e0bca5e31cc846de9b8c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 17:46:40 +0200 Subject: [PATCH 06/28] fixup! MAINT Define dtype alias for sparse matrices indices --- sklearn/metrics/_dist_metrics.pyx.tp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index d85a369b0cb39..ea5e079c19133 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -86,6 +86,7 @@ def get_valid_metric_ids(L): return [key for (key, val) in METRIC_MAPPING.items() if (val.__name__ in L) or (val in L)] +from ..utils._typedefs import SPARSE_INDEX_TYPE {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -590,8 +591,8 @@ cdef class DistanceMetric{{name_suffix}}: X_csr = X.tocsr() n_X, size = X_csr.shape X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) - X_indices = np.asarray(X_csr.indices, dtype=np.int32) - X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) + X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) if Y is None: Darr = np.zeros((n_X, n_X), dtype=DTYPE, order='C') @@ -606,8 +607,8 @@ cdef class DistanceMetric{{name_suffix}}: Y_csr = Y.tocsr() n_Y, _ = Y_csr.shape 
Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) - Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) - Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) + Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) Darr = np.zeros((n_X, n_Y), dtype=DTYPE, order='C') self.csr_cdist( From 8aa4e44791714d0a94ae353685a0ee021b15df58 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Jun 2022 18:05:59 +0200 Subject: [PATCH 07/28] TST Use cdist and pdist appropriately --- sklearn/metrics/tests/test_dist_metrics.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index d939dd431a01a..ee62d1d1f53a4 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -8,7 +8,7 @@ import pytest import scipy.sparse as sp -from scipy.spatial.distance import cdist +from scipy.spatial.distance import cdist, pdist from sklearn.metrics import DistanceMetric from sklearn.metrics._dist_metrics import ( @@ -87,7 +87,7 @@ def test_cdist(metric_param_grid, X, Y): for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) rtol_dict = {} - if metric == "mahalanobis": + if metric == "mahalanobis" and X.dtype == np.float32: # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. 
@@ -119,7 +119,7 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): D_scipy_cdist = cdist(X_bool, Y_bool, metric) if metric == "jaccard" and _IS_32BIT: - pytest.skip("Jaccard Distance on 32bit architecture is unstable.") + pytest.xfail("Jaccard Distance on 32bit architecture is unstable.") dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool, Y_bool) @@ -144,7 +144,7 @@ def test_pdist(metric_param_grid, X): for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) rtol_dict = {} - if metric == "mahalanobis": + if metric == "mahalanobis" and X.dtype == np.float32: # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. @@ -159,9 +159,9 @@ def test_pdist(metric_param_grid, X): if sp_version >= parse_version("1.6.0"): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - D_scipy_pdist = cdist(X, X, metric, **kwargs) + D_scipy_pdist = pdist(X, metric, **kwargs) else: - D_scipy_pdist = cdist(X, X, metric, **kwargs) + D_scipy_pdist = pdist(X, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X) @@ -196,7 +196,7 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): - D_scipy_pdist = cdist(X_bool, X_bool, metric) + D_scipy_pdist = pdist(X_bool, metric) dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) From 9edfa11625cf119b534b5878dc739560ee5d05db Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 09:49:19 +0200 Subject: [PATCH 08/28] DOC Improve comments Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git 
a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index ea5e079c19133..3273a48d9708e 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -423,10 +423,9 @@ cdef class DistanceMetric{{name_suffix}}: const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - """Compute the distance between vectors x1 and x2 represented - under the CSR format. + """Distance between rows of CSR matrices x1 and x2. - This can optionally be overridden in a base class. + This can optionally be overridden in a subclass. The rank-preserving surrogate distance is any measure that yields the same rank as the distance, but is more efficient to compute. For example, the @@ -456,8 +455,11 @@ cdef class DistanceMetric{{name_suffix}}: const ITYPE_t size, DTYPE_t[:, ::1] D, ) nogil except -1: - """Compute the pairwise distances between points in X - represented in the CSR format.""" + """Pairwise distances between rows in CSR matrix X. + + Note that this implementation is twice faster than csr_cdist(X, X) + because it leverages the symmetry of the problem. 
+ """ cdef: ITYPE_t i1, i2 ITYPE_t n_x1 = x1_indptr.shape[0] - 1 From ee5c6bf598e1b1224c1225a5127eb6628e7602b7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 09:46:58 +0200 Subject: [PATCH 09/28] Fixups --- sklearn/metrics/tests/test_dist_metrics.py | 12 ++++++++---- sklearn/utils/_typedefs.pyx | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index ee62d1d1f53a4..78e4a28ccbbd9 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -8,7 +8,7 @@ import pytest import scipy.sparse as sp -from scipy.spatial.distance import cdist, pdist +from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric from sklearn.metrics._dist_metrics import ( @@ -91,6 +91,8 @@ def test_cdist(metric_param_grid, X, Y): # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. + # TODO: Inspect slight numerical discrepancy + # with scipy rtol_dict = {"rtol": 1e-6} if metric == "wminkowski": @@ -148,6 +150,8 @@ def test_pdist(metric_param_grid, X): # Computation of mahalanobis differs between # the scipy and scikit-learn implementation. # Hence, we increase the relative tolerance. 
+ # TODO: Inspect slight numerical discrepancy + # with scipy rtol_dict = {"rtol": 1e-6} if metric == "wminkowski": @@ -159,9 +163,9 @@ def test_pdist(metric_param_grid, X): if sp_version >= parse_version("1.6.0"): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - D_scipy_pdist = pdist(X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) else: - D_scipy_pdist = pdist(X, metric, **kwargs) + D_scipy_pdist = cdist(X, X, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X) @@ -196,7 +200,7 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) def test_pdist_bool_metrics(metric, X_bool): - D_scipy_pdist = pdist(X_bool, metric) + D_scipy_pdist = cdist(X_bool, X_bool, metric) dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx index 839aa4e5fde83..49d0e46101b4f 100644 --- a/sklearn/utils/_typedefs.pyx +++ b/sklearn/utils/_typedefs.pyx @@ -20,7 +20,7 @@ INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd DTYPE = np.float64 # WARNING: this should match DTYPE_t in typedefs.pxd # WARNING: this must match SPARSE_INDEX_TYPE_t in typedefs.pxd -SPARSE_INDEX_TYPE = np.float32 +SPARSE_INDEX_TYPE = np.int32 # some handy constants cdef DTYPE_t INF = np.inf From bf5eb597a5783cc8c7a90615b235949dd4ca8ea6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 11:05:23 +0200 Subject: [PATCH 10/28] MAINT Wrap of indptr values to support sparse-dense This is kind of an hack for now. IMO, it would be better to use a flatiter on a view if possible. 
See discussions on: https://groups.google.com/g/cython-users/c/MR4xWCvUKHU Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 286 +++++++++++++++++---- sklearn/metrics/tests/test_dist_metrics.py | 39 ++- 2 files changed, 269 insertions(+), 56 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 3273a48d9708e..739a16d09b8d2 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -456,7 +456,7 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] D, ) nogil except -1: """Pairwise distances between rows in CSR matrix X. - + Note that this implementation is twice faster than csr_cdist(X, X) because it leverages the symmetry of the problem. """ @@ -570,34 +570,34 @@ cdef class DistanceMetric{{name_suffix}}: """ return dist - def _pairwise_dense(self, X, Y=None): + def _pairwise_dense_dense(self, X, Y): cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Xarr cdef cnp.ndarray[{{INPUT_DTYPE_t}}, ndim=2, mode='c'] Yarr cdef cnp.ndarray[DTYPE_t, ndim=2, mode='c'] Darr Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) - if Y is None: - Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), + if X is Y: + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=DTYPE, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') self.cdist(Xarr, Yarr, Darr) return Darr - def _pairwise_sparse(self, X, Y=None): + def _pairwise_sparse_sparse(self, X, Y): X_csr = X.tocsr() n_X, size = X_csr.shape X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) - if Y is None: - Darr = np.zeros((n_X, n_X), dtype=DTYPE, order='C') + if X is Y: + 
Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') self.csr_pdist( x1_data=X_data, x1_indices=X_indices, @@ -612,7 +612,7 @@ cdef class DistanceMetric{{name_suffix}}: Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) - Darr = np.zeros((n_X, n_Y), dtype=DTYPE, order='C') + Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.csr_cdist( x1_data=X_data, x1_indices=X_indices, @@ -625,6 +625,31 @@ cdef class DistanceMetric{{name_suffix}}: ) return Darr + def _pairwise_sparse_dense(self, X, Y): + X_csr = X.tocsr() + n_X, size = X_csr.shape + X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) + + n_Y, _ = Y.shape + Y_data = Y.reshape(-1) + Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.arange(stop=size*(n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE) + + Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') + self.csr_cdist( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=Y_data, + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=size, + D=Darr, + ) + return Darr + def pairwise(self, X, Y=None): """Compute the pairwise distances between X and Y @@ -646,11 +671,20 @@ cdef class DistanceMetric{{name_suffix}}: The shape (Nx, Ny) array of pairwise distances between points in X and Y. 
""" - if not issparse(X) and (Y is None or not issparse(Y)): - return self._pairwise_dense(X, Y) - if issparse(X) and (Y is None or issparse(Y)): - return self._pairwise_sparse(X, Y) + Y = X if Y is None else Y + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return self._pairwise_dense_dense(X, Y) + if X_is_sparse and Y_is_sparse: + return self._pairwise_sparse_sparse(X, Y) + if X_is_sparse and not Y_is_sparse: + return self._pairwise_sparse_dense(X, Y) + if not X_is_sparse and Y_is_sparse: + # Swapping argument and transposing the result + return self._pairwise_sparse_dense(Y, X).T #------------------------------------------------------------ # Euclidean Distance @@ -706,13 +740,21 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: unsquared = (x1_data[i1] - x2_data[i2]) @@ -829,13 +871,21 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: unsquared = (x1_data[i1] - x2_data[i2]) @@ -927,12 +977,20 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + fabs(x1_data[i1] - x2_data[i2]) @@ -1009,12 +1067,20 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) @@ -1143,14 +1209,22 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 bint has_w = self.size > 0 if has_w: while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + (self.vec[ix1] * pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) @@ -1175,8 +1249,14 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d else: while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + (pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) @@ -1319,12 +1399,20 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d = d + pow(self.vec[ix1] * fabs(x1_data[i1] - x2_data[i2]), self.p) @@ -1468,12 +1556,20 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t tmp, d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: self.vec[ix1] = x1_data[i1] - x2_data[i2] @@ -1569,12 +1665,20 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d += (x1_data[i1] != x2_data[i2]) @@ -1642,12 +1746,20 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: d += fabs(x1_data[i1] - x2_data[i2]) / (fabs(x1_data[i1]) + fabs(x2_data[i2])) @@ -1714,13 +1826,21 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] DTYPE_t num = 0.0 DTYPE_t denom = 0.0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: num += fabs(x1_data[i1] - x2_data[i2]) @@ -1796,12 +1916,20 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -1875,12 +2003,20 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: tf1 = x1_data[i1] != 0 @@ -1949,12 +2085,20 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2026,12 +2170,20 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2100,12 +2252,20 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2173,12 +2333,20 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2241,12 +2409,20 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. + ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 @@ -2315,12 +2491,20 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cnp.npy_intp ix1, ix2 cnp.npy_intp i1 = x1_start cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - ix1 = x1_indices[i1] - ix2 = x2_indices[i2] + # In the case of dense vectors, indices are repeated in the normal + # representation for all the rows, replicating information. + # Hence indices can be stored once in a buffer of `n_features` elements + # instead of being stored in a buffer of + # `n_features × n_samples` elements with repetition. + # To support both representations, we wrap `indptr` values. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] tf1 = x1_data[i1] != 0 tf2 = x2_data[i2] != 0 diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 78e4a28ccbbd9..6528ed5f381f5 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -106,11 +106,20 @@ def test_cdist(metric_param_grid, X, Y): D_scipy_cdist = cdist(X, Y, metric, **kwargs) dm = DistanceMetricInterface.get_metric(metric, **kwargs) + + # DistanceMetric.pairwise must be consistent + # on all combinations of format in {sparse, dense}². D_sklearn = dm.pairwise(X, Y) assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) - D_sklearn_csr = dm.pairwise(X_csr, Y_csr) - assert_allclose(D_sklearn_csr, D_scipy_cdist, **rtol_dict) + D_sklearn = dm.pairwise(X_csr, Y_csr) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X_csr, Y) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X, Y_csr) + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) @pytest.mark.parametrize("metric", BOOL_METRICS) @@ -127,9 +136,22 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): D_sklearn = dm.pairwise(X_bool, Y_bool) assert_allclose(D_sklearn, D_scipy_cdist) - X_bool_csr, Y_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) - D_sklearn_csr = dm.pairwise(X_bool_csr, Y_csr) - assert_allclose(D_sklearn_csr, D_scipy_cdist) + # DistanceMetric.pairwise must be consistent + # on all combinations of format in {sparse, dense}². 
+ X_bool_csr, Y_bool_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) + + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) + assert_allclose(D_sklearn, D_scipy_cdist) + + # TODO: inspect failures on Boolean DistanceMetric + # D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + # assert_allclose(D_sklearn, D_scipy_cdist) + + # D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + # assert_allclose(D_sklearn, D_scipy_cdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @@ -174,6 +196,9 @@ def test_pdist(metric_param_grid, X): D_sklearn_csr = dm.pairwise(X_csr) assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + D_sklearn_csr = dm.pairwise(X_csr, X_csr) + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @@ -205,6 +230,10 @@ def test_pdist_bool_metrics(metric, X_bool): D_sklearn = dm.pairwise(X_bool) assert_allclose(D_sklearn, D_scipy_pdist) + X_bool_csr = sp.csr_matrix(X_bool) + D_sklearn = dm.pairwise(X_bool_csr) + assert_allclose(D_sklearn, D_scipy_pdist) + # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") From 92b8a6c8d26c1059c4af1c5101e96823fe56d114 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 14:23:07 +0200 Subject: [PATCH 11/28] Apply review comments Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 168 +++++++++++---------------- sklearn/utils/_typedefs.pxd | 1 + 2 files changed, 66 insertions(+), 103 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 739a16d09b8d2..2945a99290386 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -632,10 +632,23 @@ cdef class 
DistanceMetric{{name_suffix}}: X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) + # To avoid introducing redundant implementations for the CSR × dense array + # case, we wrap the dense array into a fake CSR datastructure and leverage + # the existing code for the CSR × CSR case. + # The true CSR representation of a dense array would require allocating + # a Y_indices matrix of shape (n_samples, n_features) with repeated + # contiguous integers from 0 to n_features - 1 on each row which would + # be very wasteful from a memory point of view. Instead we only allocate + # a single row and adapt the CSR × CSR routines to use a modulo operation + # when accessing Y_indices in order to achieve the same result without having + # to materialize the indices repetition explicitly. + n_Y, _ = Y.shape Y_data = Y.reshape(-1) Y_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) - Y_indptr = np.arange(stop=size*(n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.arange( + start=0, stop=size * (n_Y + 1), step=size, dtype=SPARSE_INDEX_TYPE + ) Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') self.csr_cdist( @@ -747,12 +760,9 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -878,12 +888,9 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t unsquared = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -983,12 +990,9 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1073,12 +1077,9 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. 
- # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1405,12 +1406,9 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1562,12 +1560,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t tmp, d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1671,12 +1666,9 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1752,12 +1744,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t d = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1833,12 +1822,9 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): DTYPE_t denom = 0.0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. 
- # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1922,12 +1908,9 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, nnz = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2009,12 +1992,9 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2091,12 +2071,9 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2176,12 +2153,9 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2258,12 +2232,9 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. 
- # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2339,12 +2310,9 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2415,12 +2383,9 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -2497,12 +2462,9 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ITYPE_t tf1, tf2, n_tt = 0, n_neq = 0 while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index 9c2db8cf32c4a..a6e390705496b 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -24,4 +24,5 @@ ctypedef cnp.int64_t INT64TYPE_t # WARNING: should match INT32TYPE in typedefs. # cases. 
# # TODO: support cnp.int64_t for this case +# See: https://github.com/scikit-learn/scikit-learn/issues/23653 ctypedef cnp.int32_t SPARSE_INDEX_TYPE_t From dc6f8cf70ec138ea3dbc32337ba072a65e6abba1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Jun 2022 15:52:28 +0200 Subject: [PATCH 12/28] More interesting boolean data for tests --- sklearn/metrics/tests/test_dist_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 6528ed5f381f5..c97114117c72f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -38,8 +38,8 @@ def dist_func(x1, x2, p): [X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64]) # make boolean arrays: ones and zeros -X_bool = X64.round(0) -Y_bool = Y64.round(0) +X_bool = (X64 < 0.3).astype(np.float64) # quite sparse +Y_bool = (Y64 < 0.7).astype(np.float64) # not too sparse [X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool]) From bb06f592a3b8ac3e7d988a75a1ad24ae60d7e576 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 15:49:38 +0200 Subject: [PATCH 13/28] FIX Various corrections --- sklearn/metrics/_dist_metrics.pyx.tp | 38 +++++++++------------- sklearn/metrics/tests/test_dist_metrics.py | 4 +-- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 2945a99290386..5ea2a836b756b 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -910,15 +910,15 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] + ix2 = x2_indices[i2 % len_x2_indices] unsquared = x2_data[i2] - d = d + (unsquared * unsquared) / self.vec[ix1] + d = d + (unsquared * unsquared) / self.vec[ix2] i2 = i2 + 1 else: while i1 < x1_end: - ix1 = 
x1_indices[i1] + ix1 = x1_indices[i1 % len_x1_indices] unsquared = x1_data[i1] - d = d + (unsquared * unsquared) / self.vec[ix2] + d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 return d @@ -1218,12 +1218,9 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if has_w: while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1240,22 +1237,21 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: + ix2 = x2_indices[i2 % len_x2_indices] d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: + ix1 = x1_indices[i1 % len_x1_indices] d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d else: while i1 < x1_end and i2 < x2_end: - # In the case of dense vectors, indices are repeated in the normal - # representation for all the rows, replicating information. - # Hence indices can be stored once in a buffer of `n_features` elements - # instead of being stored in a buffer of - # `n_features × n_samples` elements with repetition. - # To support both representations, we wrap `indptr` values. + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
ix1 = x1_indices[i1 % len_x1_indices] ix2 = x2_indices[i2 % len_x2_indices] @@ -1272,12 +1268,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 @@ -1425,12 +1419,12 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] + ix2 = x2_indices[i2 % len_x2_indices] d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] + ix1 = x1_indices[i1 % len_x1_indices] d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 @@ -1579,12 +1573,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: - ix2 = x2_indices[i2] + ix2 = x2_indices[i2 % len_x2_indices] self.vec[ix2] = - x2_data[i2] i2 = i2 + 1 else: while i1 < x1_end: - ix1 = x1_indices[i1] + ix1 = x1_indices[i1 % len_x1_indices] self.vec[ix1] = x1_data[i1] i1 = i1 + 1 diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index c97114117c72f..ec46db8fc1596 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -3,8 +3,6 @@ import copy import numpy as np -from sklearn.utils._testing import assert_allclose - import pytest import scipy.sparse as sp @@ -18,7 +16,7 @@ ) from sklearn.utils import check_random_state, _IS_32BIT -from sklearn.utils._testing import create_memmap_backed_data +from sklearn.utils._testing import assert_allclose, create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version From a5eb20dfe1de36fed91ddf398c9760224c5661f3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 16:45:11 +0200 Subject: [PATCH 14/28] FIX 
Make Jaccard, Hamming and Matching robust to explicit zeros Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 32 +++++++++++++++------- sklearn/metrics/tests/test_dist_metrics.py | 9 +++--- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 5ea2a836b756b..490da7d9837c0 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -406,7 +406,12 @@ cdef class DistanceMetric{{name_suffix}}: This must be overridden in a base class. - Note that we pass all the parameter as to not use memoryview slicing + Notes + ----- + The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + All the parameters are passed as to not use memoryview slicing because it is currently known to slow down execution as it takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 """ @@ -432,7 +437,12 @@ cdef class DistanceMetric{{name_suffix}}: rank-preserving surrogate distance of the Euclidean metric is the squared-euclidean distance. - Note that we pass all the parameter as to not use memoryview slicing + Notes + ----- + The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + All the parameters are passed as to not use memoryview slicing because it is currently known to slow down execution as it takes the GIL. 
See: https://github.com/scikit-learn/scikit-learn/issues/17299 """ @@ -1671,19 +1681,19 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d += 1 + d += (x1_data[i1] != 0) i1 = i1 + 1 else: - d += 1 + d += (x2_data[i2] != 0) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d += 1 + d += (x2_data[i2] != 0) i2 = i2 + 1 else: while i1 < x1_end: - d += 1 + d += (x1_data[i1] != 0) i1 = i1 + 1 d /= size @@ -1925,10 +1935,12 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if i1 == x1_end: while i2 < x2_end: + tf2 = x2_data[i2] != 0 nnz += tf2 i2 = i2 + 1 else: while i1 < x1_end: + tf1 = x1_data[i1] != 0 nnz += tf1 i1 = i1 + 1 @@ -1999,19 +2011,19 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += (x1_data[i1] != 0) i1 = i1 + 1 else: - n_neq += 1 + n_neq += (x2_data[i2] != 0) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + n_neq += (x2_data[i2] != 0) i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + n_neq += (x1_data[i1] != 0) i1 = i1 + 1 return n_neq * 1.0 / size diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index ec46db8fc1596..07e248431c253 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -144,12 +144,11 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) assert_allclose(D_sklearn, D_scipy_cdist) - # TODO: inspect failures on Boolean DistanceMetric - # D_sklearn = dm.pairwise(X_bool, Y_bool_csr) - # assert_allclose(D_sklearn, D_scipy_cdist) + D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + assert_allclose(D_sklearn, D_scipy_cdist) - # D_sklearn = dm.pairwise(X_bool_csr, Y_bool) - # assert_allclose(D_sklearn, D_scipy_cdist) + D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + assert_allclose(D_sklearn, 
D_scipy_cdist) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed From 19edf11a1d0b001dc00b9e70abedac19836daee4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 16:58:14 +0200 Subject: [PATCH 15/28] FIX Make the other boolean DistanceMetric also robust to explicit zeros Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 50 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 490da7d9837c0..efdf02d4b34ba 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -2092,19 +2092,21 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return n_neq / (2.0 * n_tt + n_neq) @@ -2174,19 +2176,21 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return (n_neq - n_tt + size) * 1.0 / (n_neq + size) @@ -2252,19 +2256,21 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq 
+= tf1 i1 = i1 + 1 return (2.0 * n_neq) / (size + n_neq) @@ -2403,19 +2409,21 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return (2.0 * n_neq) / (size + n_neq) @@ -2483,19 +2491,21 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - n_neq += 1 + n_neq += tf1 i1 = i1 + 1 else: - n_neq += 1 + n_neq += tf2 i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - n_neq += 1 + tf2 = x2_data[i2] != 0 + n_neq += tf2 i2 = i2 + 1 else: while i1 < x1_end: - n_neq += 1 + tf1 = x1_data[i1] != 0 + n_neq += tf1 i1 = i1 + 1 return n_neq / (0.5 * n_tt + n_neq) From de8680278e94d8090024c9a85b79e720e24e04cd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 17:01:20 +0200 Subject: [PATCH 16/28] TST Remove xfail for Jaccard on 32bit arch. 
--- sklearn/metrics/tests/test_dist_metrics.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 07e248431c253..68b4f36a7d1c9 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -15,7 +15,7 @@ DistanceMetric32, ) -from sklearn.utils import check_random_state, _IS_32BIT +from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -127,9 +127,6 @@ def test_cdist(metric_param_grid, X, Y): def test_cdist_bool_metric(metric, X_bool, Y_bool): D_scipy_cdist = cdist(X_bool, Y_bool, metric) - if metric == "jaccard" and _IS_32BIT: - pytest.xfail("Jaccard Distance on 32bit architecture is unstable.") - dm = DistanceMetric.get_metric(metric) D_sklearn = dm.pairwise(X_bool, Y_bool) assert_allclose(D_sklearn, D_scipy_cdist) From bb920cfe6bbf21d9cdfbd944df547e1406594855 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Jun 2022 18:18:27 +0200 Subject: [PATCH 17/28] Cast to np.float64_t where appropriate --- sklearn/metrics/_dist_metrics.pxd.tp | 2 +- sklearn/metrics/_dist_metrics.pyx.tp | 127 +++++++++++++++------------ 2 files changed, 70 insertions(+), 59 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index af3b84cac9d4a..8ae0190e6c478 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -78,7 +78,7 @@ cdef class DistanceMetric{{name_suffix}}: # we must define them here so that cython's limited polymorphism will work. # Because we don't expect to instantiate a lot of these objects, the # extra memory overhead of this setup should not be an issue. 
- cdef {{INPUT_DTYPE_t}} p + cdef DTYPE_t p cdef DTYPE_t[::1] vec cdef DTYPE_t[:, ::1] mat cdef ITYPE_t size diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index efdf02d4b34ba..094ef38cb240d 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -588,14 +588,12 @@ cdef class DistanceMetric{{name_suffix}}: Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if X is Y: - Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), - dtype=DTYPE, order='C') + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=DTYPE, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), - dtype=DTYPE, order='C') + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=DTYPE, order='C') self.cdist(Xarr, Yarr, Darr) return Darr @@ -777,7 +775,7 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = (x1_data[i1] - x2_data[i2]) + unsquared = ((x1_data[i1]) - (x2_data[i2])) d = d + (unsquared * unsquared) i1 = i1 + 1 i2 = i2 + 1 @@ -852,8 +850,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t tmp, d=0 cdef cnp.intp_t j for j in range(size): - tmp = (x1[j] - x2[j]) - d += (tmp * tmp / self.vec[j]) + tmp = ((x1[j]) - (x2[j])) + d += (tmp * tmp / self.vec[j]) return d cdef inline DTYPE_t dist(self, @@ -905,7 +903,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = (x1_data[i1] - x2_data[i2]) + unsquared = ((x1_data[i1]) - (x2_data[i2])) d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 i2 = i2 + 1 @@ -975,7 +973,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in 
range(size): - d += fabs(x1[j] - x2[j]) + d += fabs((x1[j]) - (x2[j])) return d cdef inline DTYPE_t csr_dist(self, @@ -1007,23 +1005,23 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + fabs(x1_data[i1] - x2_data[i2]) + d = d + fabs((x1_data[i1]) - (x2_data[i2])) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 else: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 else: while i1 < x1_end: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 return d @@ -1061,7 +1059,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d = fmax(d, fabs(x1[j] - x2[j])) + d = fmax(d, fabs( (x1[j]) - (x2[j]))) return d @@ -1094,23 +1092,23 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) + d = fmax(d, fabs((x1_data[i1]) - (x2_data[i2]))) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 else: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 else: while i1 < x1_end: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 return d @@ -1179,10 +1177,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef bint has_w = self.size > 0 if has_w: for j in range(size): - d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) + d += (self.vec[j] * pow(fabs( + (x1[j]) - (x2[j]) + ), self.p)) else: for j in range(size): - d += 
(pow(fabs(x1[j] - x2[j]), self.p)) + d += (pow(fabs((x1[j]) - (x2[j])), self.p)) return d cdef inline DTYPE_t dist(self, @@ -1235,25 +1235,27 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + d = d + (self.vec[ix1] * pow(fabs( + (x1_data[i1]) - (x2_data[i2]) + ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1266,23 +1268,25 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + (pow(fabs(x1_data[i1] - x2_data[i2]), self.p)) + d = d + (pow(fabs( + (x1_data[i1]) - (x2_data[i2]) + ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1366,7 +1370,9 @@ cdef class 
WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)) + d += (pow(self.vec[j] * fabs( + (x1[j]) - (x2[j]) + ), self.p)) return d cdef inline DTYPE_t dist(self, @@ -1417,25 +1423,27 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + pow(self.vec[ix1] * fabs(x1_data[i1] - x2_data[i2]), self.p) + d = d + pow(self.vec[ix1] * fabs( + (x1_data[i1]) - (x2_data[i2]) + ), self.p) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 else: - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 return d @@ -1514,7 +1522,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # compute (x1 - x2).T * VI * (x1 - x2) for i in range(size): - self.vec[i] = x1[i] - x2[i] + self.vec[i] = (x1[i]) - (x2[i]) for i in range(size): tmp = 0 @@ -1571,7 +1579,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - self.vec[ix1] = x1_data[i1] - x2_data[i2] + self.vec[ix1] = (x1_data[i1]) - (x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: @@ -1721,9 +1729,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t denom, d = 0 cdef cnp.intp_t j for j in range(size): - denom 
= (fabs(x1[j]) + fabs(x2[j])) + denom = fabs((x1[j])) + fabs((x2[j])) if denom > 0: - d += (fabs(x1[j] - x2[j])) / denom + d += fabs((x1[j]) - (x2[j])) / denom return d cdef inline DTYPE_t csr_dist(self, @@ -1755,7 +1763,10 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d += fabs(x1_data[i1] - x2_data[i2]) / (fabs(x1_data[i1]) + fabs(x2_data[i2])) + d += ( + fabs((x1_data[i1]) - (x2_data[i2])) / + (fabs((x1_data[i1])) + fabs((x2_data[i2]))) + ) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: @@ -1796,8 +1807,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t num = 0, denom = 0 cdef cnp.intp_t j for j in range(size): - num += fabs(x1[j] - x2[j]) - denom += (fabs(x1[j]) + fabs(x2[j])) + num += fabs((x1[j]) - (x2[j])) + denom += (fabs(x1[j]) + fabs(x2[j])) if denom > 0: return num / denom else: @@ -1833,28 +1844,28 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - num += fabs(x1_data[i1] - x2_data[i2]) - denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + num += fabs((x1_data[i1]) - (x2_data[i2])) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i1 = i1 + 1 else: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i2 = i2 + 1 else: while i1 < x1_end: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i1 = i1 + 1 return num / denom @@ -2537,9 +2548,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const 
{{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) - cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) - return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + cdef DTYPE_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) + cdef DTYPE_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) cdef inline DTYPE_t dist(self, const {{INPUT_DTYPE_t}}* x1, From b3759fead28f064e519701786e0d0fd9522437bf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Jun 2022 12:34:23 +0200 Subject: [PATCH 18/28] Rename methods and correctly format their signatures Co-authored-by: Christian Lorentzen --- sklearn/metrics/_dist_metrics.pxd.tp | 24 ++- sklearn/metrics/_dist_metrics.pyx.tp | 189 ++++++++++++++------- sklearn/metrics/tests/test_dist_metrics.py | 4 +- 3 files changed, 141 insertions(+), 76 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 8ae0190e6c478..a12c089b089fc 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -85,19 +85,22 @@ cdef class DistanceMetric{{name_suffix}}: cdef object func cdef object kwargs - cdef DTYPE_t dist(self, + cdef DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1 - cdef DTYPE_t rdist(self, + cdef DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1 - cdef DTYPE_t csr_dist(self, + cdef DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -109,7 +112,8 @@ cdef class DistanceMetric{{name_suffix}}: const ITYPE_t size, ) nogil except -1 - cdef DTYPE_t csr_rdist(self, + cdef DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -121,18 +125,21 
@@ cdef class DistanceMetric{{name_suffix}}: const ITYPE_t size, ) nogil except -1 - cdef int pdist(self, + cdef int pdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1 - cdef int cdist(self, + cdef int cdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, ) except -1 - cdef int csr_pdist(self, + cdef int pdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, @@ -140,7 +147,8 @@ cdef class DistanceMetric{{name_suffix}}: DTYPE_t[:, ::1] D, ) nogil except -1 - cdef int csr_cdist(self, + cdef int cdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 094ef38cb240d..e83254dec8895 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -336,7 +336,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return - cdef DTYPE_t dist(self, + cdef DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -347,7 +348,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return -999 - cdef DTYPE_t rdist(self, + cdef DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -363,7 +365,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return self.dist(x1, x2, size) - cdef int pdist(self, + cdef int pdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, DTYPE_t[:, ::1] D, ) except -1: @@ -376,7 +379,8 @@ cdef class DistanceMetric{{name_suffix}}: return 0 - cdef int cdist(self, + cdef int cdist( + self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, DTYPE_t[:, ::1] D, @@ -390,7 +394,8 @@ cdef class DistanceMetric{{name_suffix}}: D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 - cdef DTYPE_t csr_dist(self, + 
cdef DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -417,7 +422,8 @@ cdef class DistanceMetric{{name_suffix}}: """ return -999 - cdef DTYPE_t csr_rdist(self, + cdef DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -446,7 +452,7 @@ cdef class DistanceMetric{{name_suffix}}: because it is currently known to slow down execution as it takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 """ - return self.csr_dist( + return self.dist_csr( x1_data, x1_indices, x2_data, @@ -458,7 +464,8 @@ cdef class DistanceMetric{{name_suffix}}: size, ) - cdef int csr_pdist(self, + cdef int pdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, @@ -467,7 +474,7 @@ cdef class DistanceMetric{{name_suffix}}: ) nogil except -1: """Pairwise distances between rows in CSR matrix X. - Note that this implementation is twice faster than csr_cdist(X, X) + Note that this implementation is twice faster than cdist_csr(X, X) because it leverages the symmetry of the problem. 
""" cdef: @@ -481,7 +488,7 @@ cdef class DistanceMetric{{name_suffix}}: for i2 in range(i1, n_x1): x2_start = x1_indptr[i2] x2_end = x1_indptr[i2 + 1] - D[i1, i2] = D[i2, i1] = self.csr_dist( + D[i1, i2] = D[i2, i1] = self.dist_csr( x1_data, x1_indices, x1_data, @@ -494,7 +501,8 @@ cdef class DistanceMetric{{name_suffix}}: ) return 0 - cdef int csr_cdist(self, + cdef int cdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const SPARSE_INDEX_TYPE_t[:] x1_indptr, @@ -519,7 +527,7 @@ cdef class DistanceMetric{{name_suffix}}: x2_start = x2_indptr[i2] x2_end = x2_indptr[i2 + 1] - D[i1, i2] = self.csr_dist( + D[i1, i2] = self.dist_csr( x1_data, x1_indices, x2_data, @@ -606,7 +614,7 @@ cdef class DistanceMetric{{name_suffix}}: if X is Y: Darr = np.empty((n_X, n_X), dtype=DTYPE, order='C') - self.csr_pdist( + self.pdist_csr( x1_data=X_data, x1_indices=X_indices, x1_indptr=X_indptr, @@ -621,7 +629,7 @@ cdef class DistanceMetric{{name_suffix}}: Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') - self.csr_cdist( + self.cdist_csr( x1_data=X_data, x1_indices=X_indices, x1_indptr=X_indptr, @@ -659,7 +667,7 @@ cdef class DistanceMetric{{name_suffix}}: ) Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') - self.csr_cdist( + self.cdist_csr( x1_data=X_data, x1_indices=X_indices, x1_indptr=X_indptr, @@ -745,7 +753,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -801,7 +810,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] 
x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -812,7 +822,8 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - return sqrt(self.csr_rdist( + return sqrt( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -842,7 +853,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -854,7 +866,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -873,7 +886,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -930,7 +944,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -941,7 +956,8 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - return sqrt(self.csr_rdist( + return sqrt( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -965,7 +981,8 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline DTYPE_t dist(self, + cdef inline 
DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -976,7 +993,8 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs((x1[j]) - (x2[j])) return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1051,7 +1069,8 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1063,7 +1082,8 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1167,7 +1187,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). 
" f"Currently len(w)={self.size}.") - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1185,7 +1206,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs((x1[j]) - (x2[j])), self.p)) return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1204,7 +1226,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1291,7 +1314,8 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1303,7 +1327,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const ITYPE_t size, ) nogil except -1: return pow( - self.csr_rdist( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -1361,7 +1385,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError('WMinkowskiDistance dist: ' 'size of w does not match') - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1375,7 +1400,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ), self.p)) return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1394,7 +1420,8 @@ cdef class 
WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1448,7 +1475,8 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1460,7 +1488,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const ITYPE_t size, ) nogil except -1: return pow( - self.csr_rdist( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -1512,7 +1540,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline DTYPE_t rdist(self, + cdef inline DTYPE_t rdist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1531,7 +1560,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.vec[i] return d - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1550,7 +1580,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline DTYPE_t csr_rdist(self, + cdef inline DTYPE_t rdist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1608,7 +1639,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const 
SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1619,7 +1651,8 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const SPARSE_INDEX_TYPE_t x2_end, const ITYPE_t size, ) nogil except -1: - return sqrt(self.csr_rdist( + return sqrt( + self.rdist_csr( x1_data, x1_indices, x2_data, @@ -1643,7 +1676,8 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1656,7 +1690,8 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1721,7 +1756,8 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1734,7 +1770,8 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs((x1[j]) - (x2[j])) / denom return d - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1799,7 +1836,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1814,7 +1852,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1882,7 +1921,8 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1901,7 +1941,8 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0 return (nnz - n_eq) * 1.0 / nnz - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -1974,7 +2015,8 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -1987,7 +2029,8 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. 
/ size - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2052,7 +2095,8 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2066,7 +2110,8 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq / (2.0 * n_tt + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2136,7 +2181,8 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2150,7 +2196,8 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (n_neq - n_tt + size) * 1.0 / (n_neq + size) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2218,7 +2265,8 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2231,7 +2279,8 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + 
self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2298,7 +2347,8 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2311,7 +2361,8 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (size - n_tt) * 1. / size - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2371,7 +2422,8 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2384,7 +2436,8 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const {{INPUT_DTYPE_t}}[:] x2_data, @@ -2451,7 +2504,8 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, @@ -2465,7 +2519,8 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return n_neq / (0.5 * n_tt + n_neq) - cdef inline DTYPE_t csr_dist(self, + cdef inline DTYPE_t dist_csr( + self, const {{INPUT_DTYPE_t}}[:] x1_data, const SPARSE_INDEX_TYPE_t[:] x1_indices, const 
{{INPUT_DTYPE_t}}[:] x2_data, @@ -2595,14 +2650,16 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, + cdef inline DTYPE_t dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: return self._dist(x1, x2, size) - cdef inline DTYPE_t _dist(self, + cdef inline DTYPE_t _dist( + self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 68b4f36a7d1c9..714f200397532 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -105,8 +105,8 @@ def test_cdist(metric_param_grid, X, Y): dm = DistanceMetricInterface.get_metric(metric, **kwargs) - # DistanceMetric.pairwise must be consistent - # on all combinations of format in {sparse, dense}². + # DistanceMetric.pairwise must be consistent for all + # combinations of formats in {sparse, dense}. D_sklearn = dm.pairwise(X, Y) assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) From 7f89236a58604597f08281eecd4013f9a6798aeb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Jun 2022 12:44:46 +0200 Subject: [PATCH 19/28] fixup! TST Remove xfail for Jaccard on 32bit arch. 
--- sklearn/metrics/_dist_metrics.pyx.tp | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index e83254dec8895..f36a0701c32f9 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -1198,12 +1198,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef bint has_w = self.size > 0 if has_w: for j in range(size): - d += (self.vec[j] * pow(fabs( + d += (self.vec[j] * pow(fabs( (x1[j]) - (x2[j]) ), self.p)) else: for j in range(size): - d += (pow(fabs((x1[j]) - (x2[j])), self.p)) + d += (pow(fabs((x1[j]) - (x2[j])), self.p)) return d cdef inline DTYPE_t dist( @@ -1212,13 +1212,13 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) + return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow(rdist, 1. / self.p) + return pow( rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow(dist, self.p) + return pow( dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. / self.p) @@ -1395,9 +1395,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += (pow(self.vec[j] * fabs( - (x1[j]) - (x2[j]) - ), self.p)) + d += (pow(self.vec[j] * fabs((x1[j]) - (x2[j])), self.p)) return d cdef inline DTYPE_t dist( @@ -1406,13 +1404,13 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) + return pow(self.rdist(x1, x2, size), 1. 
/ self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow(rdist, 1. / self.p) + return pow( rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow(dist, self.p) + return pow( dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. / self.p) @@ -1846,7 +1844,7 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef cnp.intp_t j for j in range(size): num += fabs((x1[j]) - (x2[j])) - denom += (fabs(x1[j]) + fabs(x2[j])) + denom += (fabs(x1[j]) + fabs(x2[j])) if denom > 0: return num / denom else: @@ -2618,7 +2616,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 2 * asin(sqrt(rdist)) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - cdef DTYPE_t tmp = sin(0.5 * dist) + cdef DTYPE_t tmp = sin(0.5 * dist) return tmp * tmp def rdist_to_dist(self, rdist): From 01a0c3311cc566ba83855ac0743bcc4c2ae56633 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 20 Jun 2022 15:42:31 +0200 Subject: [PATCH 20/28] FEA CSR support for HaversineDistance --- sklearn/metrics/_dist_metrics.pyx.tp | 99 ++++++++++++++++++++++ sklearn/metrics/tests/test_dist_metrics.py | 43 +++++++--- 2 files changed, 132 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index f36a0701c32f9..1fd5dc7dfdeed 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -2626,6 +2626,105 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): tmp = np.sin(0.5 * dist) return tmp * tmp + cdef inline DTYPE_t dist_csr( + self, + const {{INPUT_DTYPE_t}}[:] x1_data, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const {{INPUT_DTYPE_t}}[:] x2_data, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const 
SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, + const ITYPE_t size, + ) nogil except -1: + return 2 * asin(sqrt(self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ))) + + cdef inline DTYPE_t rdist_csr( + self, + const {{INPUT_DTYPE_t}}[:] x1_data, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const {{INPUT_DTYPE_t}}[:] x2_data, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + const SPARSE_INDEX_TYPE_t x1_start, + const SPARSE_INDEX_TYPE_t x1_end, + const SPARSE_INDEX_TYPE_t x2_start, + const SPARSE_INDEX_TYPE_t x2_end, + const ITYPE_t size, + ) nogil except -1: + + cdef: + cnp.npy_intp ix1, ix2 + cnp.npy_intp i1 = x1_start + cnp.npy_intp i2 = x2_start + cnp.npy_intp len_x1_indices = x1_indices.shape[0] + cnp.npy_intp len_x2_indices = x2_indices.shape[0] + + DTYPE_t x1_0 = 0 + DTYPE_t x1_1 = 0 + DTYPE_t x2_0 = 0 + DTYPE_t x2_1 = 0 + DTYPE_t sin_0 + DTYPE_t sin_1 + + while i1 < x1_end and i2 < x2_end: + # Use the modulo-trick to implement support for CSR × dense array + # with the CSR × CSR routine. See _pairwise_sparse_dense for more + # details. 
+ ix1 = x1_indices[i1 % len_x1_indices] + ix2 = x2_indices[i2 % len_x2_indices] + + # Find the components in the 2D vectors to work with + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + + i1 = i1 + 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2 % len_x2_indices] + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1 % len_x1_indices] + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + i1 = i1 + 1 + + sin_0 = sin(0.5 * (x1_0 - x2_0)) + sin_1 = sin(0.5 * (x1_1 - x2_1)) + + return (sin_0 * sin_0 + cos(x1_0) * cos(x2_0) * sin_1 * sin_1) + #------------------------------------------------------------ # User-defined distance # diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 714f200397532..eb4cd384a966f 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -266,7 +266,20 @@ def test_pickle_bool_metrics(metric, X_bool): assert_allclose(D1, D2) -def test_haversine_metric(): +@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) +def test_haversine_metric(X, Y): + DistanceMetricInterface = ( + DistanceMetric if X.dtype == np.float64 else DistanceMetric32 + ) + + # The Haversine DistanceMetric only works on 2 features. + X = np.asarray(X[:, :2]) + Y = np.asarray(Y[:, :2]) + + X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y) + + # Haversine is not supported by scipy.spatial.distance.{cdist,pdist} + # So we reimplement it to have a reference.
def haversine_slow(x1, x2): return 2 * np.arcsin( np.sqrt( @@ -275,18 +288,28 @@ def haversine_slow(x1, x2): ) ) - X = np.random.random((10, 2)) + D_reference = np.zeros((X_csr.shape[0], Y_csr.shape[0])) + for i, xi in enumerate(X): + for j, yj in enumerate(Y): + D_reference[i, j] = haversine_slow(xi, yj) - haversine = DistanceMetric.get_metric("haversine") + haversine = DistanceMetricInterface.get_metric("haversine") - D1 = haversine.pairwise(X) - D2 = np.zeros_like(D1) - for i, x1 in enumerate(X): - for j, x2 in enumerate(X): - D2[i, j] = haversine_slow(x1, x2) + D_sklearn = haversine.pairwise(X, Y) + assert_allclose( + haversine.dist_to_rdist(D_sklearn), np.sin(0.5 * D_reference) ** 2, rtol=1e-6 + ) - assert_allclose(D1, D2) - assert_allclose(haversine.dist_to_rdist(D1), np.sin(0.5 * D2) ** 2) + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y_csr) + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y) + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X, Y_csr) + assert_allclose(D_sklearn, D_reference) def test_pyfunc_metric(): From 7d8a7173c03ef11dac9788de5682c6b4b4efef6e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Jun 2022 10:44:29 +0200 Subject: [PATCH 21/28] Fix typo Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 1fd5dc7dfdeed..4670f98030e0a 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -409,7 +409,7 @@ cdef class DistanceMetric{{name_suffix}}: """Compute the distance between vectors x1 and x2 represented under the CSR format. - This must be overridden in a base class. + This must be overridden in a subclass. 
Notes ----- From 563e3590c4bb5b3e3a6de4f42bc2e6f56a4769dc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Jun 2022 11:54:40 +0200 Subject: [PATCH 22/28] Do not upcast to 64bit yet keep the same precision --- sklearn/metrics/_dist_metrics.pyx.tp | 120 ++++++++++----------- sklearn/metrics/tests/test_dist_metrics.py | 8 +- 2 files changed, 65 insertions(+), 63 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 4670f98030e0a..2784a767cf0c9 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -784,7 +784,7 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = ((x1_data[i1]) - (x2_data[i2])) + unsquared = x1_data[i1] - x2_data[i2] d = d + (unsquared * unsquared) i1 = i1 + 1 i2 = i2 + 1 @@ -862,7 +862,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t tmp, d=0 cdef cnp.intp_t j for j in range(size): - tmp = ((x1[j]) - (x2[j])) + tmp = x1[j] - x2[j] d += (tmp * tmp / self.vec[j]) return d @@ -917,7 +917,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - unsquared = ((x1_data[i1]) - (x2_data[i2])) + unsquared = x1_data[i1] - x2_data[i2] d = d + (unsquared * unsquared) / self.vec[ix1] i1 = i1 + 1 i2 = i2 + 1 @@ -990,7 +990,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d += fabs((x1[j]) - (x2[j])) + d += fabs(x1[j] - x2[j]) return d cdef inline DTYPE_t dist_csr( @@ -1023,23 +1023,23 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = d + fabs((x1_data[i1]) - (x2_data[i2])) + d = d + fabs(x1_data[i1] - x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + 
fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 else: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + fabs(x2_data[i2]) + d = d + fabs(x2_data[i2]) i2 = i2 + 1 else: while i1 < x1_end: - d = d + fabs(x1_data[i1]) + d = d + fabs(x1_data[i1]) i1 = i1 + 1 return d @@ -1078,7 +1078,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j in range(size): - d = fmax(d, fabs( (x1[j]) - (x2[j]))) + d = fmax(d, fabs(x1[j] - x2[j])) return d @@ -1112,23 +1112,23 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - d = fmax(d, fabs((x1_data[i1]) - (x2_data[i2]))) + d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 else: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = fmax(d, fabs(x2_data[i2])) + d = fmax(d, fabs(x2_data[i2])) i2 = i2 + 1 else: while i1 < x1_end: - d = fmax(d, fabs(x1_data[i1])) + d = fmax(d, fabs(x1_data[i1])) i1 = i1 + 1 return d @@ -1198,12 +1198,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef bint has_w = self.size > 0 if has_w: for j in range(size): - d += (self.vec[j] * pow(fabs( - (x1[j]) - (x2[j]) - ), self.p)) + d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) else: for j in range(size): - d += (pow(fabs((x1[j]) - (x2[j])), self.p)) + d += (pow(fabs(x1[j] - x2[j]), self.p)) return d cdef inline DTYPE_t dist( @@ -1215,10 +1213,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow( rdist, 1. / self.p) + return pow(rdist, 1. 
/ self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow( dist, self.p) + return pow(dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. / self.p) @@ -1259,26 +1257,26 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d = d + (self.vec[ix1] * pow(fabs( - (x1_data[i1]) - (x2_data[i2]) + x1_data[i1] - x2_data[i2] ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1292,24 +1290,24 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d = d + (pow(fabs( - (x1_data[i1]) - (x2_data[i2]) + x1_data[i1] - x2_data[i2] ), self.p)) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 else: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - d = d + (pow(fabs(x2_data[i2]), self.p)) + d = d + (pow(fabs(x2_data[i2]), self.p)) i2 = i2 + 1 else: while i1 < x1_end: - d = d + (pow(fabs(x1_data[i1]), self.p)) + d = d + (pow(fabs(x1_data[i1]), self.p)) i1 = i1 + 1 return d @@ -1395,7 +1393,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t d = 0 cdef cnp.intp_t j for j 
in range(size): - d += (pow(self.vec[j] * fabs((x1[j]) - (x2[j])), self.p)) + d += (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)) return d cdef inline DTYPE_t dist( @@ -1407,10 +1405,10 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return pow(self.rdist(x1, x2, size), 1. / self.p) cdef inline DTYPE_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) nogil except -1: - return pow( rdist, 1. / self.p) + return pow(rdist, 1. / self.p) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - return pow( dist, self.p) + return pow(dist, self.p) def rdist_to_dist(self, rdist): return rdist ** (1. / self.p) @@ -1449,26 +1447,26 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d = d + pow(self.vec[ix1] * fabs( - (x1_data[i1]) - (x2_data[i2]) + x1_data[i1] - x2_data[i2] ), self.p) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 else: - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: ix2 = x2_indices[i2 % len_x2_indices] - d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) + d = d + pow(self.vec[ix2] * fabs(x2_data[i2]), self.p) i2 = i2 + 1 else: while i1 < x1_end: ix1 = x1_indices[i1 % len_x1_indices] - d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) + d = d + pow(self.vec[ix1] * fabs(x1_data[i1]), self.p) i1 = i1 + 1 return d @@ -1549,7 +1547,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # compute (x1 - x2).T * VI * (x1 - x2) for i in range(size): - self.vec[i] = (x1[i]) - (x2[i]) + self.vec[i] = x1[i] - x2[i] for i in range(size): tmp = 0 @@ -1608,7 +1606,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - 
self.vec[ix1] = (x1_data[i1]) - (x2_data[i2]) + self.vec[ix1] = x1_data[i1] - x2_data[i2] i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: @@ -1763,9 +1761,9 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t denom, d = 0 cdef cnp.intp_t j for j in range(size): - denom = fabs((x1[j])) + fabs((x2[j])) + denom = fabs(x1[j]) + fabs(x2[j]) if denom > 0: - d += fabs((x1[j]) - (x2[j])) / denom + d += fabs(x1[j] - x2[j]) / denom return d cdef inline DTYPE_t dist_csr( @@ -1799,8 +1797,8 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if ix1 == ix2: d += ( - fabs((x1_data[i1]) - (x2_data[i2])) / - (fabs((x1_data[i1])) + fabs((x2_data[i2]))) + fabs(x1_data[i1] - x2_data[i2]) / + (fabs(x1_data[i1]) + fabs(x2_data[i2])) ) i1 = i1 + 1 i2 = i2 + 1 @@ -1843,8 +1841,8 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef DTYPE_t num = 0, denom = 0 cdef cnp.intp_t j for j in range(size): - num += fabs((x1[j]) - (x2[j])) - denom += (fabs(x1[j]) + fabs(x2[j])) + num += fabs(x1[j] - x2[j]) + denom += fabs(x1[j]) + fabs(x2[j]) if denom > 0: return num / denom else: @@ -1881,28 +1879,28 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ix2 = x2_indices[i2 % len_x2_indices] if ix1 == ix2: - num += fabs((x1_data[i1]) - (x2_data[i2])) - denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + num += fabs(x1_data[i1] - x2_data[i2]) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) i1 = i1 + 1 i2 = i2 + 1 elif ix1 < ix2: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i1 = i1 + 1 else: - num += fabs(x2_data[i2]) - denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i2 = i2 + 1 if i1 == x1_end: while i2 < x2_end: - num += fabs(x1_data[i1]) - denom += fabs(x1_data[i1]) + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) i2 = i2 + 1 else: while i1 < x1_end: - num += fabs(x2_data[i2]) - 
denom += fabs(x2_data[i2]) + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) i1 = i1 + 1 return num / denom @@ -2601,9 +2599,9 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): const {{INPUT_DTYPE_t}}* x2, ITYPE_t size, ) nogil except -1: - cdef DTYPE_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) - cdef DTYPE_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) - return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + cdef DTYPE_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) + cdef DTYPE_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) cdef inline DTYPE_t dist(self, const {{INPUT_DTYPE_t}}* x1, @@ -2616,7 +2614,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 2 * asin(sqrt(rdist)) cdef inline DTYPE_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) nogil except -1: - cdef DTYPE_t tmp = sin(0.5 * dist) + cdef DTYPE_t tmp = sin(0.5 * dist) return tmp * tmp def rdist_to_dist(self, rdist): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index eb4cd384a966f..66c7802a33f24 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -209,11 +209,15 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - assert_allclose(D64, D32, rtol=1e-5) + + # The original rtol is 1e-4 in of presence of float32. + # We increase the tolerance to 1e-6 to be stricter. 
+ rtol = 1e-6 + assert_allclose(D64, D32, rtol=rtol) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32, rtol=1e-5) + assert_allclose(D64, D32, rtol=rtol) @pytest.mark.parametrize("metric", BOOL_METRICS) From f863a517307c6d4ab695e8fbf0d14a01a84cc2f8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 22 Jun 2022 13:29:11 +0200 Subject: [PATCH 23/28] Do use the default rtol --- sklearn/metrics/tests/test_dist_metrics.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 66c7802a33f24..3a96ffb6909ef 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -209,15 +209,11 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - - # The original rtol is 1e-4 in of presence of float32. - # We increase the tolerance to 1e-6 to be stricter. 
- rtol = 1e-6 - assert_allclose(D64, D32, rtol=rtol) + assert_allclose(D64, D32) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32, rtol=rtol) + assert_allclose(D64, D32) @pytest.mark.parametrize("metric", BOOL_METRICS) From 5ba0fbeea2a5c12e8f888bc20b37a0403d0ab40b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 22 Jun 2022 13:52:42 +0200 Subject: [PATCH 24/28] Set rtol explicitly in test_distance_metrics_dtype_consistency --- sklearn/metrics/tests/test_dist_metrics.py | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 3a96ffb6909ef..ebd5f702eac9b 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -198,10 +198,15 @@ def test_pdist(metric_param_grid, X): @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_distance_metrics_dtype_consistency(metric_param_grid): - # DistanceMetric must return similar distances for - # both 64bit and 32bit data. + # DistanceMetric must return similar distances for both float32 and float64 + # input data. metric, param_grid = metric_param_grid keys = param_grid.keys() + + # Chose rtol to make sure that this test is robust to changes in the random + # seed in the module-level test data generation code. + rtol = 1e-5 + for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) dm64 = DistanceMetric.get_metric(metric, **kwargs) @@ -209,11 +214,21 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) - assert_allclose(D64, D32) + + # Both results are np.float64 dtype because the accumulation across + # features is done in float64.
However the input data and the element + # wise arithmetic operations are done in float32 so we can expect a + # small discrepancy. + assert D64.dtype == D32.dtype == np.float64 + + # assert_allclose introspects the dtype of the input arrays to decide + # which rtol value to use by default but in this case we know that D32 + # is not computed with the same precision so we set rtol manually. + assert_allclose(D64, D32, rtol=rtol) D64 = dm64.pairwise(X64, Y64) D32 = dm32.pairwise(X32, Y32) - assert_allclose(D64, D32) + assert_allclose(D64, D32, rtol=rtol) @pytest.mark.parametrize("metric", BOOL_METRICS) From 4f4583934c151051510ba7cead1256bb6a7dabe6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Jun 2022 11:32:58 +0200 Subject: [PATCH 25/28] Implement the sparse-dense and the dense-sparse case for c-contiguity Also do test for c-contiguity. --- sklearn/metrics/_dist_metrics.pyx.tp | 37 ++++++++++++++++++++-- sklearn/metrics/tests/test_dist_metrics.py | 16 +++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 2784a767cf0c9..a1ff483d6c702 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -679,6 +679,39 @@ cdef class DistanceMetric{{name_suffix}}: ) return Darr + def _pairwise_dense_sparse(self, X, Y): + # Same remark as in _pairwise_sparse_dense. We could + # have implemented this method using _pairwise_sparse_dense, + # but this would have come with an extra copy to ensure + # c-contiguity of the result.
+ + Y_csr = Y.tocsr() + n_Y, size = Y_csr.shape + Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) + Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) + + n_X, _ = X.shape + X_data = X.reshape(-1) + X_indices = np.arange(size, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.arange( + start=0, stop=size * (n_X + 1), step=size, dtype=SPARSE_INDEX_TYPE + ) + + Darr = np.empty((n_X, n_Y), dtype=DTYPE, order='C') + self.cdist_csr( + x1_data=X_data, + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=Y_data, + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=size, + D=Darr, + ) + return Darr + + def pairwise(self, X, Y=None): """Compute the pairwise distances between X and Y @@ -711,9 +744,7 @@ cdef class DistanceMetric{{name_suffix}}: return self._pairwise_sparse_sparse(X, Y) if X_is_sparse and not Y_is_sparse: return self._pairwise_sparse_dense(X, Y) - if not X_is_sparse and Y_is_sparse: - # Swapping argument and transposing the result - return self._pairwise_sparse_dense(Y, X).T + return self._pairwise_dense_sparse(X, Y) #------------------------------------------------------------ # Euclidean Distance diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index ebd5f702eac9b..e11be4dab3e20 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -108,15 +108,19 @@ def test_cdist(metric_param_grid, X, Y): # DistanceMetric.pairwise must be consistent for all # combinations of formats in {sparse, dense}. 
D_sklearn = dm.pairwise(X, Y) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) D_sklearn = dm.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) D_sklearn = dm.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) D_sklearn = dm.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) @@ -136,15 +140,19 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): X_bool_csr, Y_bool_csr = sp.csr_matrix(X_bool), sp.csr_matrix(Y_bool) D_sklearn = dm.pairwise(X_bool, Y_bool) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_cdist) @@ -185,12 +193,15 @@ def test_pdist(metric_param_grid, X): dm = DistanceMetricInterface.get_metric(metric, **kwargs) D_sklearn = dm.pairwise(X) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) D_sklearn_csr = dm.pairwise(X_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) D_sklearn_csr = dm.pairwise(X_csr, X_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) @@ -203,7 +214,7 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): metric, param_grid = metric_param_grid keys = param_grid.keys() - # Chose rtol to make sure that this test is robust to changes in the random + # Choose rtol to make sure that this test is robust to changes in the random # seed in the module-level test data 
generation code. rtol = 1e-5 @@ -318,12 +329,15 @@ def haversine_slow(x1, x2): assert_allclose(D_sklearn, D_reference) D_sklearn = haversine.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_reference) D_sklearn = haversine.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_reference) D_sklearn = haversine.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_reference) From 3e3e8881d4bc1c6d11a9e473a1efd8c90bb5d1cc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Jun 2022 11:50:43 +0200 Subject: [PATCH 26/28] Add validation on X and Y, accepting CSR as inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/_dist_metrics.pyx.tp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index a1ff483d6c702..98a77a61530d8 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -721,19 +721,25 @@ cdef class DistanceMetric{{name_suffix}}: Parameters ---------- - X : array-like - Array of shape (Nx, D), representing Nx points in D dimensions. - Y : array-like (optional) - Array of shape (Ny, D), representing Ny points in D dimensions. + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. If not specified, then Y=X. Returns ------- - dist : ndarray - The shape (Nx, Ny) array of pairwise distances between points in - X and Y. + dist : ndarray of shape (n_samples_X, n_samples_Y) + The distance matrix of pairwise distances between points in X and Y. 
""" Y = X if Y is None else Y + X_is_Y = X is Y + X = check_array(X, accept_sparse=['csr']) + + if X_is_Y: + Y = X + else: + Y = check_array(Y, accept_sparse=['csr']) X_is_sparse = issparse(X) Y_is_sparse = issparse(Y) From ddc49d5a6373b882492cdd93da6d6d9ef15cdd63 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Jun 2022 12:50:39 +0200 Subject: [PATCH 27/28] Remove left-overs Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx.tp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 98a77a61530d8..3dd720454fdee 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -642,11 +642,10 @@ cdef class DistanceMetric{{name_suffix}}: return Darr def _pairwise_sparse_dense(self, X, Y): - X_csr = X.tocsr() - n_X, size = X_csr.shape - X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) - X_indices = np.asarray(X_csr.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X_csr.indptr, dtype=SPARSE_INDEX_TYPE) + n_X, size = X.shape + X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) # To avoid introducing redundant implementations for the CSR × dense array # case, we wrap the dense array into a fake CSR datastructure and leverage @@ -684,12 +683,10 @@ cdef class DistanceMetric{{name_suffix}}: # have implemented this method using _pairwise_dense_sparse, # but this would have come with an extra copy to ensure # c-contiguity of the result. 
- - Y_csr = Y.tocsr() - n_Y, size = Y_csr.shape - Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) - Y_indices = np.asarray(Y_csr.indices, dtype=SPARSE_INDEX_TYPE) - Y_indptr = np.asarray(Y_csr.indptr, dtype=SPARSE_INDEX_TYPE) + n_Y, size = Y.shape + Y_data = np.asarray(Y.data, dtype={{INPUT_DTYPE}}) + Y_indices = np.asarray(Y.indices, dtype=SPARSE_INDEX_TYPE) + Y_indptr = np.asarray(Y.indptr, dtype=SPARSE_INDEX_TYPE) n_X, _ = X.shape X_data = X.reshape(-1) @@ -729,14 +726,12 @@ cdef class DistanceMetric{{name_suffix}}: Returns ------- - dist : ndarray of shape (n_samples_X, n_samples_Y) + dist : ndarray of shape (n_samples_X, n_samples_Y) The distance matrix of pairwise distances between points in X and Y. """ - Y = X if Y is None else Y - X_is_Y = X is Y X = check_array(X, accept_sparse=['csr']) - if X_is_Y: + if Y is None: Y = X else: Y = check_array(Y, accept_sparse=['csr']) From 731370a0fad51c50a7e95ba2a52da8c75721fd17 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 29 Jun 2022 11:14:13 +0200 Subject: [PATCH 28/28] DOC Motivate the signature for DistanceMetric.{dist_csr, rdist_csr} Co-authored-by: Christian Lorentzen --- sklearn/metrics/_dist_metrics.pyx.tp | 43 ++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index e1f30f3ac4e28..a7574bff86510 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -415,9 +415,41 @@ cdef class DistanceMetric{{name_suffix}}: The implementation of this method in subclasses must be robust to the presence of explicit zeros in the CSR representation. - All the parameters are passed as to not use memoryview slicing - because it is currently known to slow down execution as it - takes the GIL. 
See: https://github.com/scikit-learn/scikit-learn/issues/17299 + An alternative signature would be: + + cdef DTYPE_t dist_csr( + self, + const {{INPUT_DTYPE_t}}[:] x1_data, + const SPARSE_INDEX_TYPE_t[:] x1_indices, + const {{INPUT_DTYPE_t}}[:] x2_data, + const SPARSE_INDEX_TYPE_t[:] x2_indices, + ) nogil except -1: + + Where callers would use slicing on the original CSR data and indices + memoryviews: + + x1_start = X1_csr.indptr[i] + x1_end = X1_csr.indptr[i+1] + x2_start = X2_csr.indptr[j] + x2_end = X2_csr.indptr[j+1] + + self.dist_csr( + x1_data[x1_start:x1_end], + x1_indices[x1_start:x1_end], + x2_data[x2_start:x2_end], + x2_indices[x2_start:x2_end], + ) + + Yet, slicing on memoryview slows down execution as it takes the GIL. + See: https://github.com/scikit-learn/scikit-learn/issues/17299 + + Hence, to avoid slicing, the data and indices arrays of the sparse + matrices containing respectively x1 and x2 (namely x{1,2}_{data,indices}) + are passed as well as their index pointers (namely x{1,2}_{start,end}). + + For reference about the CSR format, see section 3.4 of + Saad, Y. (2003), Iterative Methods for Sparse Linear Systems, SIAM. + https://www-users.cse.umn.edu/~saad/IterMethBook_2ndEd.pdf """ return -999 @@ -447,9 +479,8 @@ cdef class DistanceMetric{{name_suffix}}: The implementation of this method in subclasses must be robust to the presence of explicit zeros in the CSR representation. - All the parameters are passed as to not use memoryview slicing - because it is currently known to slow down execution as it - takes the GIL. See: https://github.com/scikit-learn/scikit-learn/issues/17299 + More information about the motives for this method signature is given + in the docstring of dist_csr. """ return self.dist_csr( x1_data,