From 8ae1cb67111f71a875c748942f0405d38860165e Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 23 Feb 2022 16:01:06 +0100
Subject: [PATCH 01/26] MAINT Generate DistanceMetrics for 32bit vectors

---
 sklearn/metrics/_dist_metrics.pxd             |  87 ------
 sklearn/metrics/_dist_metrics.pxd.tp          |  99 +++++++
 ..._dist_metrics.pyx => _dist_metrics.pyx.tp} | 268 +++++++++---------
 sklearn/metrics/setup.py                      |   9 +
 4 files changed, 239 insertions(+), 224 deletions(-)
 delete mode 100644 sklearn/metrics/_dist_metrics.pxd
 create mode 100644 sklearn/metrics/_dist_metrics.pxd.tp
 rename sklearn/metrics/{_dist_metrics.pyx => _dist_metrics.pyx.tp} (85%)

diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
deleted file mode 100644
index e7c2f2ea2f926..0000000000000
--- a/sklearn/metrics/_dist_metrics.pxd
+++ /dev/null
@@ -1,87 +0,0 @@
-cimport numpy as np
-from libc.math cimport sqrt, exp
-
-from ..utils._typedefs cimport DTYPE_t, ITYPE_t
-
-######################################################################
-# Inline distance functions
-#
-#  We use these for the default (euclidean) case so that they can be
-#  inlined.  This leads to faster computation for the most common case
-cdef inline DTYPE_t euclidean_dist(const DTYPE_t* x1, const DTYPE_t* x2,
-                                   ITYPE_t size) nogil except -1:
-    cdef DTYPE_t tmp, d=0
-    cdef np.intp_t j
-    for j in range(size):
-        tmp = x1[j] - x2[j]
-        d += tmp * tmp
-    return sqrt(d)
-
-
-cdef inline DTYPE_t euclidean_rdist(const DTYPE_t* x1, const DTYPE_t* x2,
-                                    ITYPE_t size) nogil except -1:
-    cdef DTYPE_t tmp, d=0
-    cdef np.intp_t j
-    for j in range(size):
-        tmp = x1[j] - x2[j]
-        d += tmp * tmp
-    return d
-
-
-cdef inline DTYPE_t euclidean_dist_to_rdist(const DTYPE_t dist) nogil except -1:
-    return dist * dist
-
-
-cdef inline DTYPE_t euclidean_rdist_to_dist(const DTYPE_t dist) nogil except -1:
-    return sqrt(dist)
-
-
-######################################################################
-# DistanceMetric base class
-cdef class DistanceMetric:
-    # The following attributes are required for a few of the subclasses.
-    # we must define them here so that cython's limited polymorphism will work.
-    # Because we don't expect to instantiate a lot of these objects, the
-    # extra memory overhead of this setup should not be an issue.
-    cdef DTYPE_t p
-    cdef DTYPE_t[::1] vec
-    cdef DTYPE_t[:, ::1] mat
-    cdef ITYPE_t size
-    cdef object func
-    cdef object kwargs
-
-    cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                      ITYPE_t size) nogil except -1
-
-    cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                       ITYPE_t size) nogil except -1
-
-    cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
-
-    cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
-                   DTYPE_t[:, ::1] D) except -1
-
-    cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1
-
-    cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1
-
-
-######################################################################
-# DatasetsPair base class
-cdef class DatasetsPair:
-    cdef DistanceMetric distance_metric
-
-    cdef ITYPE_t n_samples_X(self) nogil
-
-    cdef ITYPE_t n_samples_Y(self) nogil
-
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil
-
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
-
-
-cdef class DenseDenseDatasetsPair(DatasetsPair):
-    cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
-        ITYPE_t d
diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
new file mode 100644
index 0000000000000..8435597c7a186
--- /dev/null
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -0,0 +1,99 @@
+{{py:
+
+dtypes = [
+    ('', 'DTYPE_t'),
+    ('32', 'np.float32_t'),
+]
+
+}}
+cimport numpy as np
+from libc.math cimport sqrt, exp
+
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t
+
+{{for name_suffix, DTYPE_t in dtypes}}
+
+######################################################################
+# Inline distance functions
+#
+#  We use these for the default (euclidean) case so that they can be
+#  inlined.  This leads to faster computation for the most common case
+cdef inline {{DTYPE_t}} euclidean_dist{{name_suffix}}(
+        const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1:
+    cdef {{DTYPE_t}} delta, acc=0  # acc is the running sum of squared differences
+    cdef np.intp_t k
+    for k in range(size):
+        delta = x1[k] - x2[k]
+        acc += delta * delta
+    return sqrt(acc)  # Euclidean distance over the first `size` coordinates
+
+
+cdef inline {{DTYPE_t}} euclidean_rdist{{name_suffix}}(
+        const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2, ITYPE_t size) nogil except -1:
+    cdef {{DTYPE_t}} delta, acc=0  # acc is the running sum of squared differences
+    cdef np.intp_t k
+    for k in range(size):
+        delta = x1[k] - x2[k]
+        acc += delta * delta
+    return acc  # squared Euclidean distance: the sqrt is deliberately skipped
+
+
+cdef inline {{DTYPE_t}} euclidean_dist_to_rdist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
+    return dist * dist  # the reduced distance is the squared Euclidean distance
+
+
+cdef inline {{DTYPE_t}} euclidean_rdist_to_dist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
+    return sqrt(dist)  # the argument is the reduced (squared) distance
+
+
+######################################################################
+# DistanceMetric base class
+cdef class DistanceMetric{{name_suffix}}:
+    # The attributes below are only needed by some of the subclasses, but they
+    # must be declared here so that cython's limited polymorphism will work.
+    # Because we don't expect to instantiate a lot of these objects, the
+    # extra memory overhead of this setup should not be an issue.
+    cdef {{DTYPE_t}} p  # order of the metric: subclasses set 2 (Euclidean), 1 (Manhattan), inf (Chebyshev)
+    cdef {{DTYPE_t}}[::1] vec  # per-feature vector, indexed by feature in e.g. SEuclidean/WMinkowski rdist
+    cdef {{DTYPE_t}}[:, ::1] mat  # NOTE(review): presumably a matrix parameter for some subclass; confirm
+    cdef ITYPE_t size  # length of the per-feature parameter; compared with X.shape[1] by some subclasses
+    cdef object func
+    cdef object kwargs
+
+    cdef {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                      ITYPE_t size) nogil except -1  # distance between vectors x1 and x2
+
+    cdef {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                       ITYPE_t size) nogil except -1  # rank-preserving surrogate of dist
+
+    cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1  # pairwise distances within X, written to D
+
+    cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y,
+                   {{DTYPE_t}}[:, ::1] D) except -1  # cross distances between rows of X and Y, written to D
+
+    cdef {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1  # surrogate distance -> distance
+
+    cdef {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1  # distance -> surrogate distance
+
+
+######################################################################
+# DatasetsPair base class
+cdef class DatasetsPair{{name_suffix}}:
+    cdef DistanceMetric{{name_suffix}} distance_metric  # metric of the same generated variant as this class
+
+    cdef ITYPE_t n_samples_X(self) nogil  # NOTE(review): presumably the number of rows of X; confirm in the .pyx.tp
+
+    cdef ITYPE_t n_samples_Y(self) nogil  # NOTE(review): presumably the number of rows of Y; confirm in the .pyx.tp
+
+    cdef {{DTYPE_t}} dist(self, ITYPE_t i, ITYPE_t j) nogil  # NOTE(review): presumably distance between X[i] and Y[j]; confirm
+
+    cdef {{DTYPE_t}} surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil  # NOTE(review): presumably the rdist counterpart of dist; confirm
+
+
+cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+    cdef:
+        const {{DTYPE_t}}[:, ::1] X  # read-only, C-contiguous 2D memoryview
+        const {{DTYPE_t}}[:, ::1] Y  # read-only, C-contiguous 2D memoryview
+        ITYPE_t d  # NOTE(review): presumably the shared number of features of X and Y; confirm
+
+{{endfor}}
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx.tp
similarity index 85%
rename from sklearn/metrics/_dist_metrics.pyx
rename to sklearn/metrics/_dist_metrics.pyx.tp
index c442be0398980..3ecfd0f3155ff 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -1,3 +1,12 @@
+{{py:
+
+
+dtypes = [
+    ('', 'DTYPE_t'),
+    ('32', 'np.float32_t'),
+]
+
+}}
 # By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
 # written for the scikit-learn project
 # License: BSD
@@ -14,26 +23,16 @@ cdef extern from "arrayobject.h":
     object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims,
                                      int typenum, void* data)
 
-
-cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
-    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
-    # In particular, if x is deallocated before the returned array goes
-    # out of scope, this could cause memory errors.  Since there is not
-    # a possibility of this for our use-case, this should be safe.
-
-    # Note: this Segfaults unless np.import_array() is called above
-    return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)
-
-
-from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
-cdef DTYPE_t INF = np.inf
-
 from scipy.sparse import csr_matrix, issparse
 from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
 from ..utils._typedefs import DTYPE, ITYPE
 from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
 from ..utils import check_array
 
+cdef inline double fmax(double a, double b) nogil:
+    return b if b > a else a  # larger of the two; a is kept unless b is strictly greater
+
+
 ######################################################################
 # newObj function
 #  this is a helper function for pickling
@@ -95,9 +94,25 @@ def get_valid_metric_ids(L):
             if (val.__name__ in L) or (val in L)]
 
 
+{{for name_suffix, DTYPE_t in dtypes}}
+
+cdef inline np.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, np.npy_intp n):
+    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
+    # In particular, if x is deallocated before the returned array goes
+    # out of scope, this could cause memory errors.  Since there is not
+    # a possibility of this for our use-case, this should be safe.
+    # The typenum must match the element type of x in each generated variant.
+    # Note: this Segfaults unless np.import_array() is called above
+    return PyArray_SimpleNewFromData(1, &n, {{'np.NPY_FLOAT32' if name_suffix == '32' else 'DTYPECODE'}}, <void*>x)
+
+
+from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
+cdef {{DTYPE_t}} INF{{name_suffix}} = np.inf
+
+
 ######################################################################
 # Distance Metric Classes
-cdef class DistanceMetric:
+cdef class DistanceMetric{{name_suffix}}:
     """DistanceMetric class
 
     This class provides a uniform interface to fast distance metric
@@ -297,7 +312,7 @@ cdef class DistanceMetric:
         """
         return
 
-    cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                       ITYPE_t size) nogil except -1:
         """Compute the distance between vectors x1 and x2
 
@@ -305,7 +320,7 @@ cdef class DistanceMetric:
         """
         return -999
 
-    cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                        ITYPE_t size) nogil except -1:
         """Compute the rank-preserving surrogate distance between vectors x1 and x2.
 
@@ -318,7 +333,7 @@ cdef class DistanceMetric:
         """
         return self.dist(x1, x2, size)
 
-    cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1:
+    cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1:
         """compute the pairwise distances between points in X"""
         cdef ITYPE_t i1, i2
         for i1 in range(X.shape[0]):
@@ -327,8 +342,8 @@ cdef class DistanceMetric:
                 D[i2, i1] = D[i1, i2]
         return 0
 
-    cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
-                   DTYPE_t[:, ::1] D) except -1:
+    cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y,
+                   {{DTYPE_t}}[:, ::1] D) except -1:
         """compute the cross-pairwise distances between arrays X and Y"""
         cdef ITYPE_t i1, i2
         if X.shape[1] != Y.shape[1]:
@@ -338,11 +353,11 @@ cdef class DistanceMetric:
                 D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
         return 0
 
-    cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         """Convert the rank-preserving surrogate distance to the distance"""
         return rdist
 
-    cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
+    cdef {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         """Convert the distance to the rank-preserving surrogate distance"""
         return dist
 
@@ -407,9 +422,9 @@ cdef class DistanceMetric:
             The shape (Nx, Ny) array of pairwise distances between points in
             X and Y.
         """
-        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr
-        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr
-        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr
+        cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr
+        cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr
+        cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Darr
 
         Xarr = np.asarray(X, dtype=DTYPE, order='C')
         self._validate_data(Xarr)
@@ -429,7 +444,7 @@ cdef class DistanceMetric:
 #------------------------------------------------------------
 # Euclidean Distance
 #  d = sqrt(sum(x_i^2 - y_i^2))
-cdef class EuclideanDistance(DistanceMetric):
+cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Euclidean Distance metric
 
     .. math::
@@ -438,18 +453,18 @@ cdef class EuclideanDistance(DistanceMetric):
     def __init__(self):
         self.p = 2
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        return euclidean_dist(x1, x2, size)
+        return euclidean_dist{{name_suffix}}(x1, x2, size)
 
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        return euclidean_rdist(x1, x2, size)
+        return euclidean_rdist{{name_suffix}}(x1, x2, size)
 
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return sqrt(rdist)
 
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
+    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return dist * dist
 
     def rdist_to_dist(self, rdist):
@@ -462,7 +477,7 @@ cdef class EuclideanDistance(DistanceMetric):
 #------------------------------------------------------------
 # SEuclidean Distance
 #  d = sqrt(sum((x_i - y_i2)^2 / v_i))
-cdef class SEuclideanDistance(DistanceMetric):
+cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Standardized Euclidean Distance metric
 
     .. math::
@@ -477,23 +492,23 @@ cdef class SEuclideanDistance(DistanceMetric):
         if X.shape[1] != self.size:
             raise ValueError('SEuclidean dist: size of V does not match')
 
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t tmp, d=0
+        cdef {{DTYPE_t}} tmp, d=0
         cdef np.intp_t j
         for j in range(size):
             tmp = x1[j] - x2[j]
             d += tmp * tmp / self.vec[j]
         return d
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return sqrt(self.rdist(x1, x2, size))
 
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return sqrt(rdist)
 
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
+    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return dist * dist
 
     def rdist_to_dist(self, rdist):
@@ -506,7 +521,7 @@ cdef class SEuclideanDistance(DistanceMetric):
 #------------------------------------------------------------
 # Manhattan Distance
 #  d = sum(abs(x_i - y_i))
-cdef class ManhattanDistance(DistanceMetric):
+cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Manhattan/City-block Distance metric
 
     .. math::
@@ -515,9 +530,9 @@ cdef class ManhattanDistance(DistanceMetric):
     def __init__(self):
         self.p = 1
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
+        cdef {{DTYPE_t}} d = 0
         cdef np.intp_t j
         for j in range(size):
             d += fabs(x1[j] - x2[j])
@@ -527,7 +542,7 @@ cdef class ManhattanDistance(DistanceMetric):
 #------------------------------------------------------------
 # Chebyshev Distance
 #  d = max_i(abs(x_i - y_i))
-cdef class ChebyshevDistance(DistanceMetric):
+cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     """Chebyshev/Infinity Distance
 
     .. math::
@@ -546,11 +561,11 @@ cdef class ChebyshevDistance(DistanceMetric):
            [6.928..., 0....   ]])
     """
     def __init__(self):
-        self.p = INF
+        self.p = INF{{name_suffix}}
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
+        cdef {{DTYPE_t}} d = 0
         cdef np.intp_t j
         for j in range(size):
             d = fmax(d, fabs(x1[j] - x2[j]))
@@ -559,7 +574,7 @@ cdef class ChebyshevDistance(DistanceMetric):
 
 #------------------------------------------------------------
 # Minkowski Distance
-cdef class MinkowskiDistance(DistanceMetric):
+cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Minkowski Distance
 
     .. math::
@@ -610,9 +625,9 @@ cdef class MinkowskiDistance(DistanceMetric):
                              f"the number of features ({X.shape[1]}). "
                              f"Currently len(w)={self.size}.")
 
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d=0
+        cdef {{DTYPE_t}} d=0
         cdef np.intp_t j
         cdef bint has_w = self.size > 0
         if has_w:
@@ -623,14 +638,14 @@ cdef class MinkowskiDistance(DistanceMetric):
                 d += pow(fabs(x1[j] - x2[j]), self.p)
         return d
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return pow(self.rdist(x1, x2, size), 1. / self.p)
 
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return pow(rdist, 1. / self.p)
 
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
+    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return pow(dist, self.p)
 
     def rdist_to_dist(self, rdist):
@@ -643,7 +658,7 @@ cdef class MinkowskiDistance(DistanceMetric):
 #------------------------------------------------------------
 # TODO: Remove in 1.3 - WMinkowskiDistance class
 # W-Minkowski Distance
-cdef class WMinkowskiDistance(DistanceMetric):
+cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Weighted Minkowski Distance
 
     .. math::
@@ -684,22 +699,22 @@ cdef class WMinkowskiDistance(DistanceMetric):
             raise ValueError('WMinkowskiDistance dist: '
                              'size of w does not match')
 
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d=0
+        cdef {{DTYPE_t}} d=0
         cdef np.intp_t j
         for j in range(size):
             d += pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)
         return d
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return pow(self.rdist(x1, x2, size), 1. / self.p)
 
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return pow(rdist, 1. / self.p)
 
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
+    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return pow(dist, self.p)
 
     def rdist_to_dist(self, rdist):
@@ -712,7 +727,7 @@ cdef class WMinkowskiDistance(DistanceMetric):
 #------------------------------------------------------------
 # Mahalanobis Distance
 #  d = sqrt( (x - y)^T V^-1 (x - y) )
-cdef class MahalanobisDistance(DistanceMetric):
+cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     """Mahalanobis Distance
 
     .. math::
@@ -747,9 +762,9 @@ cdef class MahalanobisDistance(DistanceMetric):
         if X.shape[1] != self.size:
             raise ValueError('Mahalanobis dist: size of V does not match')
 
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t tmp, d = 0
+        cdef {{DTYPE_t}} tmp, d = 0
         cdef np.intp_t i, j
 
         # compute (x1 - x2).T * VI * (x1 - x2)
@@ -763,14 +778,14 @@ cdef class MahalanobisDistance(DistanceMetric):
             d += tmp * self.vec[i]
         return d
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return sqrt(self.rdist(x1, x2, size))
 
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return sqrt(rdist)
 
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
+    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return dist * dist
 
     def rdist_to_dist(self, rdist):
@@ -783,7 +798,7 @@ cdef class MahalanobisDistance(DistanceMetric):
 #------------------------------------------------------------
 # Hamming Distance
 #  d = N_unequal(x, y) / N_tot
-cdef class HammingDistance(DistanceMetric):
+cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Hamming Distance
 
     Hamming distance is meant for discrete-valued vectors, though it is
@@ -792,7 +807,7 @@ cdef class HammingDistance(DistanceMetric):
     .. math::
        D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int n_unequal = 0
         cdef np.intp_t j
@@ -805,7 +820,7 @@ cdef class HammingDistance(DistanceMetric):
 #------------------------------------------------------------
 # Canberra Distance
 #  D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ]
-cdef class CanberraDistance(DistanceMetric):
+cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Canberra Distance
 
     Canberra distance is meant for discrete-valued vectors, though it is
@@ -814,9 +829,9 @@ cdef class CanberraDistance(DistanceMetric):
     .. math::
        D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t denom, d = 0
+        cdef {{DTYPE_t}} denom, d = 0
         cdef np.intp_t j
         for j in range(size):
             denom = fabs(x1[j]) + fabs(x2[j])
@@ -828,7 +843,7 @@ cdef class CanberraDistance(DistanceMetric):
 #------------------------------------------------------------
 # Bray-Curtis Distance
 #  D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)]
-cdef class BrayCurtisDistance(DistanceMetric):
+cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Bray-Curtis Distance
 
     Bray-Curtis distance is meant for discrete-valued vectors, though it is
@@ -837,9 +852,9 @@ cdef class BrayCurtisDistance(DistanceMetric):
     .. math::
        D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t num = 0, denom = 0
+        cdef {{DTYPE_t}} num = 0, denom = 0
         cdef np.intp_t j
         for j in range(size):
             num += fabs(x1[j] - x2[j])
@@ -853,17 +868,15 @@ cdef class BrayCurtisDistance(DistanceMetric):
 #------------------------------------------------------------
 # Jaccard Distance (boolean)
 #  D(x, y) = N_unequal(x, y) / N_nonzero(x, y)
-cdef class JaccardDistance(DistanceMetric):
+cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Jaccard Distance
 
     Jaccard Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_eq = 0, nnz = 0
         cdef np.intp_t j
@@ -883,17 +896,15 @@ cdef class JaccardDistance(DistanceMetric):
 #------------------------------------------------------------
 # Matching Distance (boolean)
 #  D(x, y) = n_neq / n
-cdef class MatchingDistance(DistanceMetric):
+cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Matching Distance
 
     Matching Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{N}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0
         cdef np.intp_t j
@@ -907,17 +918,15 @@ cdef class MatchingDistance(DistanceMetric):
 #------------------------------------------------------------
 # Dice Distance (boolean)
 #  D(x, y) = n_neq / (2 * ntt + n_neq)
-cdef class DiceDistance(DistanceMetric):
+cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Dice Distance
 
     Dice Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0, ntt = 0
         cdef np.intp_t j
@@ -932,17 +941,15 @@ cdef class DiceDistance(DistanceMetric):
 #------------------------------------------------------------
 # Kulsinski Distance (boolean)
 #  D(x, y) = (ntf + nft - ntt + n) / (n_neq + n)
-cdef class KulsinskiDistance(DistanceMetric):
+cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Kulsinski Distance
 
     Kulsinski Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, ntt = 0, n_neq = 0
         cdef np.intp_t j
@@ -957,17 +964,15 @@ cdef class KulsinskiDistance(DistanceMetric):
 #------------------------------------------------------------
 # Rogers-Tanimoto Distance (boolean)
 #  D(x, y) = 2 * n_neq / (n + n_neq)
-cdef class RogersTanimotoDistance(DistanceMetric):
+cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Rogers-Tanimoto Distance
 
     Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0
         cdef np.intp_t j
@@ -981,17 +986,15 @@ cdef class RogersTanimotoDistance(DistanceMetric):
 #------------------------------------------------------------
 # Russell-Rao Distance (boolean)
 #  D(x, y) = (n - ntt) / n
-cdef class RussellRaoDistance(DistanceMetric):
+cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Russell-Rao Distance
 
     Russell-Rao Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N - N_{TT}}{N}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, ntt = 0
         cdef np.intp_t j
@@ -1005,17 +1008,15 @@ cdef class RussellRaoDistance(DistanceMetric):
 #------------------------------------------------------------
 # Sokal-Michener Distance (boolean)
 #  D(x, y) = 2 * n_neq / (n + n_neq)
-cdef class SokalMichenerDistance(DistanceMetric):
+cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Sokal-Michener Distance
 
     Sokal-Michener Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0
         cdef np.intp_t j
@@ -1029,17 +1030,15 @@ cdef class SokalMichenerDistance(DistanceMetric):
 #------------------------------------------------------------
 # Sokal-Sneath Distance (boolean)
 #  D(x, y) = n_neq / (0.5 * n_tt + n_neq)
-cdef class SokalSneathDistance(DistanceMetric):
+cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Sokal-Sneath Distance
 
     Sokal-Sneath Distance is a dissimilarity measure for boolean-valued
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}}
     """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, ntt = 0, n_neq = 0
         cdef np.intp_t j
@@ -1055,7 +1054,7 @@ cdef class SokalSneathDistance(DistanceMetric):
 # Haversine Distance (2 dimensional)
 #  D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2)
 #                          + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]}
-cdef class HaversineDistance(DistanceMetric):
+cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     """Haversine (Spherical) Distance
 
     The Haversine distance is the angular distance between two points on
@@ -1063,9 +1062,6 @@ cdef class HaversineDistance(DistanceMetric):
     to be the latitude, the second is the longitude, given in radians.
     The dimension of the points must be 2:
 
-    .. math::
-       D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)
-                                + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]
     """
 
     def _validate_data(self, X):
@@ -1073,21 +1069,21 @@ cdef class HaversineDistance(DistanceMetric):
             raise ValueError("Haversine distance only valid "
                              "in 2 dimensions")
 
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0]))
-        cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1]))
+        cdef {{DTYPE_t}} sin_0 = sin(0.5 * (x1[0] - x2[0]))
+        cdef {{DTYPE_t}} sin_1 = sin(0.5 * (x1[1] - x2[1]))
         return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
 
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return 2 * asin(sqrt(self.rdist(x1, x2, size)))
 
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
+    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return 2 * asin(sqrt(rdist))
 
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        cdef DTYPE_t tmp = sin(0.5 * dist)
+    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        cdef {{DTYPE_t}} tmp = sin(0.5 * dist)
         return tmp * tmp
 
     def rdist_to_dist(self, rdist):
@@ -1104,7 +1100,7 @@ cdef class HaversineDistance(DistanceMetric):
 # [This is not a true metric, so we will leave it out.]
 #
 #cdef class YuleDistance(DistanceMetric):
-#    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+#    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
 #                             ITYPE_t size):
 #        cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0
 #        cdef np.intp_t j
@@ -1124,9 +1120,9 @@ cdef class HaversineDistance(DistanceMetric):
 # [This is not a true metric, so we will leave it out.]
 #
 #cdef class CosineDistance(DistanceMetric):
-#    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+#    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
 #                             ITYPE_t size):
-#        cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0
+#        cdef {{DTYPE_t}} d = 0, norm1 = 0, norm2 = 0
 #        cdef np.intp_t j
 #        for j in range(size):
 #            d += x1[j] * x2[j]
@@ -1140,11 +1136,11 @@ cdef class HaversineDistance(DistanceMetric):
 #  D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|)
 # [This is not a true metric, so we will leave it out.]
 #
-#cdef class CorrelationDistance(DistanceMetric):
-#    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+#cdef class CorrelationDistance(DistanceMetric{{name_suffix}}):
+#    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
 #                             ITYPE_t size):
-#        cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0
-#        cdef DTYPE_t tmp1, tmp2
+#        cdef {{DTYPE_t}} mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0
+#        cdef {{DTYPE_t}} tmp1, tmp2
 #
 #        cdef np.intp_t i
 #        for i in range(size):
@@ -1166,7 +1162,7 @@ cdef class HaversineDistance(DistanceMetric):
 #------------------------------------------------------------
 # User-defined distance
 #
-cdef class PyFuncDistance(DistanceMetric):
+cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     """PyFunc Distance
 
     A user-defined distance
@@ -1185,16 +1181,16 @@ cdef class PyFuncDistance(DistanceMetric):
     # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
     # only way to be back compatible is to inherit `dist` from the base class
     # without GIL and called an inline `_dist` which acquire GIL.
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return self._dist(x1, x2, size)
 
-    cdef inline DTYPE_t _dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
+    cdef inline {{DTYPE_t}} _dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) except -1 with gil:
         cdef np.ndarray x1arr
         cdef np.ndarray x2arr
-        x1arr = _buffer_to_ndarray(x1, size)
-        x2arr = _buffer_to_ndarray(x2, size)
+        x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size)
+        x2arr = _buffer_to_ndarray{{name_suffix}}(x2, size)
         d = self.func(x1arr, x2arr, **self.kwargs)
         try:
             # Cython generates code here that results in a TypeError
@@ -1205,13 +1201,9 @@ cdef class PyFuncDistance(DistanceMetric):
                             "vectors and return a float.")
 
 
-cdef inline double fmax(double a, double b) nogil:
-    return max(a, b)
-
-
 ######################################################################
 # Datasets Pair Classes
-cdef class DatasetsPair:
+cdef class DatasetsPair{{name_suffix}}:
     """Abstract class which wraps a pair of datasets (X, Y).
 
     This class allows computing distances between a single pair of rows of
@@ -1316,17 +1308,17 @@ cdef class DatasetsPair:
         # TODO: add "with gil: raise" here when supporting Cython 3.0
         return -999
 
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef {{DTYPE_t}} surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.dist(i, j)
 
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef {{DTYPE_t}} dist(self, ITYPE_t i, ITYPE_t j) nogil:
         # This is a abstract method.
         # This _must_ always be overwritten in subclasses.
         # TODO: add "with gil: raise" here when supporting Cython 3.0
         return -1
 
 @final
-cdef class DenseDenseDatasetsPair(DatasetsPair):
+cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
     """Compute distances between row vectors of two arrays.
 
     Parameters
@@ -1342,7 +1334,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
         between two row vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, DistanceMetric distance_metric):
+    def __init__(self, X, Y, DistanceMetric{{name_suffix}} distance_metric):
         super().__init__(distance_metric)
         # Arrays have already been checked
         self.X = X
@@ -1358,13 +1350,15 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
         return self.Y.shape[0]
 
     @final
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef {{DTYPE_t}} surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.distance_metric.rdist(&self.X[i, 0],
                                           &self.Y[j, 0],
                                           self.d)
 
     @final
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef {{DTYPE_t}} dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.distance_metric.dist(&self.X[i, 0],
                                          &self.Y[j, 0],
                                          self.d)
+
+{{endfor}}
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index 1c26d9969397c..f9203c7789979 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -3,6 +3,8 @@
 
 from numpy.distutils.misc_util import Configuration
 
+from sklearn._build_utils import gen_from_templates
+
 
 def configuration(parent_package="", top_path=None):
     config = Configuration("metrics", parent_package, top_path)
@@ -19,6 +21,13 @@ def configuration(parent_package="", top_path=None):
         "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries
     )
 
+    templates = [
+        "sklearn/metrics/_dist_metrics.pyx.tp",
+        "sklearn/metrics/_dist_metrics.pxd.tp",
+    ]
+
+    gen_from_templates(templates)
+
     config.add_extension(
         "_dist_metrics",
         sources=["_dist_metrics.pyx"],

From 487c0f131f29807e1b5046f6aeb1b16713ca976c Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 23 Feb 2022 16:35:03 +0100
Subject: [PATCH 02/26] MAINT Generate PairwiseDistancesReduction for 32bit and
 64bit

Also populate the .gitignore with new files
---
 .gitignore                                    |   4 +
 ...x => _pairwise_distances_reduction.pyx.tp} | 139 ++++++++++--------
 sklearn/metrics/setup.py                      |   1 +
 3 files changed, 81 insertions(+), 63 deletions(-)
 rename sklearn/metrics/{_pairwise_distances_reduction.pyx => _pairwise_distances_reduction.pyx.tp} (90%)

diff --git a/.gitignore b/.gitignore
index d6ae51ec333f2..c3e28a9cfc170 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,7 @@ sklearn/utils/_seq_dataset.pxd
 sklearn/utils/_weight_vector.pyx
 sklearn/utils/_weight_vector.pxd
 sklearn/linear_model/_sag_fast.pyx
+sklearn/metrics/_weight_vector.pyx
+sklearn/metrics/_dist_metrics.pyx
+sklearn/metrics/_dist_metrics.pxd
+sklearn/metrics/_pairwise_distances_reduction.pyx
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
similarity index 90%
rename from sklearn/metrics/_pairwise_distances_reduction.pyx
rename to sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index df0918bb61334..f8716cec1088c 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -1,3 +1,11 @@
+{{py:
+
+dtypes = [
+    ('', '64', 'DTYPE_t'),
+    ('32', '32', 'np.float32_t')
+]
+
+}}
 # Pairwise Distances Reductions
 # =============================
 #
@@ -23,7 +31,6 @@ from libc.float cimport DBL_MAX
 from cython cimport final
 from cython.parallel cimport parallel, prange
 
-from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair
 from ..utils._cython_blas cimport (
   BLAS_Order,
   BLAS_Trans,
@@ -47,11 +54,14 @@ from ..utils.fixes import threadpool_limits
 from ..utils._openmp_helpers import _openmp_effective_n_threads
 from ..utils._typedefs import ITYPE, DTYPE
 
-
 np.import_array()
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
-    const DTYPE_t[:, ::1] X,
+{{for distance_suffix, pdr_suffix, DTYPE_t in dtypes}}
+
+from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
+
+cpdef {{DTYPE_t}}[::1] _sqeuclidean_row_norms{{pdr_suffix}}(
+    const {{DTYPE_t}}[:, ::1] X,
     ITYPE_t num_threads,
 ):
     """Compute the squared euclidean norm of the rows of X in parallel.
@@ -63,11 +73,11 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
         # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
         # const qualifier.
         # See: https://github.com/scipy/scipy/issues/14262
-        DTYPE_t * X_ptr = <DTYPE_t *> &X[0, 0]
+        {{DTYPE_t}} * X_ptr = <{{DTYPE_t}} *> &X[0, 0]
         ITYPE_t idx = 0
         ITYPE_t n = X.shape[0]
         ITYPE_t d = X.shape[1]
-        DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
+        {{DTYPE_t}}[::1] squared_row_norms = np.empty(n, dtype=np.float{{pdr_suffix}})  # buffer dtype must match {{DTYPE_t}}
 
     for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
         squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)
@@ -75,8 +85,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
     return squared_row_norms
 
 
-
-cdef class PairwiseDistancesReduction:
+cdef class PairwiseDistancesReduction{{pdr_suffix}}:
     """Abstract base class for pairwise distance computation & reduction.
 
     Subclasses of this class compute pairwise distances between a set of
@@ -104,7 +113,7 @@ cdef class PairwiseDistancesReduction:
 
     Parameters
     ----------
-    datasets_pair: DatasetsPair
+    datasets_pair: DatasetsPair{{distance_suffix}}
         The pair of dataset to use.
 
     chunk_size: int, default=None
@@ -150,7 +159,7 @@ cdef class PairwiseDistancesReduction:
     """
 
     cdef:
-        readonly DatasetsPair datasets_pair
+        readonly DatasetsPair{{distance_suffix}} datasets_pair
 
         # The number of threads that can be used is stored in effective_n_threads.
         #
@@ -218,7 +227,7 @@ cdef class PairwiseDistancesReduction:
 
     def __init__(
         self,
-        DatasetsPair datasets_pair,
+        DatasetsPair{{distance_suffix}} datasets_pair,
         chunk_size=None,
         n_threads=None,
         strategy=None,
@@ -417,7 +426,7 @@ cdef class PairwiseDistancesReduction:
     ) nogil:
         """Compute the pairwise distances on two chunks of X and Y and reduce them.
 
-        This is THE core computational method of PairwiseDistanceReductions.
+        This is THE core computational method of PairwiseDistancesReduction{{pdr_suffix}}.
         This must be implemented in subclasses.
         """
         return
@@ -493,7 +502,7 @@ cdef class PairwiseDistancesReduction:
         """Update datastructures after executing all the reductions."""
         return
 
-cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
+cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pdr_suffix}}):
     """Compute the argkmin of row vectors of X on the ones of Y.
 
     For each row vector of X, computes the indices of k first the rows
@@ -516,7 +525,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         The number of OpenMP threads to use for the reduction.
         Parallelism is done on chunks and the sharding of chunks
         depends on the `strategy` set on
-        :meth:`~PairwiseDistancesArgKmin.compute`.
+        :meth:`~PairwiseDistancesArgKmin{{pdr_suffix}}.compute`.
 
         See _openmp_effective_n_threads, for details about
         the specification of n_threads.
@@ -529,10 +538,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         ITYPE_t k
 
         ITYPE_t[:, ::1] argkmin_indices
-        DTYPE_t[:, ::1] argkmin_distances
+        {{DTYPE_t}}[:, ::1] argkmin_distances
 
         # Used as array of pointers to private datastructures used in threads.
-        DTYPE_t ** heaps_r_distances_chunks
+        {{DTYPE_t}} ** heaps_r_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
     @classmethod
@@ -578,7 +587,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             The number of OpenMP threads to use for the reduction.
             Parallelism is done on chunks and the sharding of chunks
             depends on the `strategy` set on
-            :meth:`~PairwiseDistancesArgKmin.compute`.
+            :meth:`~PairwiseDistancesArgKmin{{pdr_suffix}}.compute`.
 
             See _openmp_effective_n_threads, for details about
             the specification of n_threads.
@@ -630,8 +639,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         Notes
         -----
             This public classmethod is responsible for introspecting the arguments
-            values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute`
-            instance method of the most appropriate :class:`PairwiseDistancesArgKmin`
+            values to dispatch to the private :meth:`PairwiseDistancesArgKmin{{pdr_suffix}}._compute`
+            instance method of the most appropriate :class:`PairwiseDistancesArgKmin{{pdr_suffix}}`
             concrete implementation.
 
             All temporarily allocated datastructures necessary for the concrete
@@ -655,7 +664,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             # at time to leverage a call to the BLAS GEMM routine as explained
             # in more details in the docstring.
             use_squared_distances = metric == "sqeuclidean"
-            pda = FastEuclideanPairwiseDistancesArgKmin(
+            pda = FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(
                 X=X, Y=Y, k=k,
                 use_squared_distances=use_squared_distances,
                 chunk_size=chunk_size,
@@ -665,8 +674,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         else:
              # Fall back on a generic implementation that handles most scipy
              # metrics by computing the distances between 2 vectors at a time.
-            pda = PairwiseDistancesArgKmin(
-                datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+            pda = PairwiseDistancesArgKmin{{pdr_suffix}}(
+                datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs),
                 k=k,
                 chunk_size=chunk_size,
                 strategy=strategy,
@@ -684,7 +693,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
 
     def __init__(
         self,
-        DatasetsPair datasets_pair,
+        DatasetsPair{{distance_suffix}} datasets_pair,
         chunk_size=None,
         n_threads=None,
         strategy=None,
@@ -707,14 +716,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         #   - when parallelizing on Y, the pointers of those heaps are referencing
         #   small heaps which are thread-wise-allocated and whose content will be
         #   merged with the main heaps'.
-        self.heaps_r_distances_chunks = <DTYPE_t **> malloc(
-            sizeof(DTYPE_t *) * self.chunks_n_threads
+        self.heaps_r_distances_chunks = <{{DTYPE_t}} **> malloc(
+            sizeof({{DTYPE_t}} *) * self.chunks_n_threads
         )
         self.heaps_indices_chunks = <ITYPE_t **> malloc(
             sizeof(ITYPE_t *) * self.chunks_n_threads
         )
 
-        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`.
+        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin{{pdr_suffix}}.compute`.
         self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE)
         self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE)
 
@@ -737,7 +746,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             ITYPE_t i, j
             ITYPE_t n_samples_X = X_end - X_start
             ITYPE_t n_samples_Y = Y_end - Y_start
-            DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
+            {{DTYPE_t}} *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
         # Pushing the distances and their associated indices on a heap
@@ -800,8 +809,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             # As chunks of X are shared across threads, so must their
             # heaps. To solve this, each thread has its own heaps
             # which are then synchronised back in the main ones.
-            self.heaps_r_distances_chunks[thread_num] = <DTYPE_t *> malloc(
-                heaps_size * sizeof(DTYPE_t)
+            self.heaps_r_distances_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
+                heaps_size * sizeof({{DTYPE_t}})
             )
             self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(
                 heaps_size * sizeof(ITYPE_t)
@@ -870,7 +879,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         cdef:
             ITYPE_t i, j
             ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
-            DTYPE_t[:, ::1] distances = self.argkmin_distances
+            {{DTYPE_t}}[:, ::1] distances = self.argkmin_distances
         for i in prange(self.n_samples_X, schedule='static', nogil=True,
                         num_threads=self.effective_n_threads):
             for j in range(self.k):
@@ -887,14 +896,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
 
             # Values are returned identically to the way `KNeighborsMixin.kneighbors`
             # returns values. This is counter-intuitive but this allows not using
-            # complex adaptations where `PairwiseDistancesArgKmin.compute` is called.
+            # complex adaptations where `PairwiseDistancesArgKmin{{pdr_suffix}}.compute` is called.
             return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
 
         return np.asarray(self.argkmin_indices)
 
 
-cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
-    """Fast specialized alternative for PairwiseDistancesArgKmin on EuclideanDistance.
+cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesArgKmin{{pdr_suffix}}):
+    """Fast specialized alternative for PairwiseDistancesArgKmin{{pdr_suffix}} on EuclideanDistance.
 
     The full pairwise squared distances matrix is computed as follows:
 
@@ -911,18 +920,18 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
     """
 
     cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
-        const DTYPE_t[::1] X_norm_squared
-        const DTYPE_t[::1] Y_norm_squared
+        const {{DTYPE_t}}[:, ::1] X
+        const {{DTYPE_t}}[:, ::1] Y
+        const {{DTYPE_t}}[::1] X_norm_squared
+        const {{DTYPE_t}}[::1] Y_norm_squared
 
         # Buffers for GEMM
-        DTYPE_t ** dist_middle_terms_chunks
+        {{DTYPE_t}} ** dist_middle_terms_chunks
         bint use_squared_distances
 
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
-        return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and
+        return (PairwiseDistancesArgKmin{{pdr_suffix}}.is_usable_for(X, Y, metric) and
                 not _in_unstable_openblas_configuration())
 
     def __init__(
@@ -946,32 +955,34 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
 
         super().__init__(
             # The datasets pair here is used for exact distances computations
-            datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),
+            datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric="euclidean"),
             chunk_size=chunk_size,
             n_threads=n_threads,
             strategy=strategy,
             k=k,
         )
-        # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair
+        # X and Y are checked by the DatasetsPair{{distance_suffix}} implemented as a DenseDenseDatasetsPair{{distance_suffix}}
         cdef:
-            DenseDenseDatasetsPair datasets_pair = <DenseDenseDatasetsPair> self.datasets_pair
+            DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = (
+            <DenseDenseDatasetsPair{{distance_suffix}}> self.datasets_pair
+        )
         self.X, self.Y = datasets_pair.X, datasets_pair.Y
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared")
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{pdr_suffix}}(self.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms(self.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{pdr_suffix}}(self.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
         # Temporary datastructures used in threads
-        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(
-            sizeof(DTYPE_t *) * self.chunks_n_threads
+        self.dist_middle_terms_chunks = <{{DTYPE_t}} **> malloc(
+            sizeof({{DTYPE_t}} *) * self.chunks_n_threads
         )
 
     def __dealloc__(self):
@@ -981,18 +992,18 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
     @final
     cdef void compute_exact_distances(self) nogil:
         if not self.use_squared_distances:
-            PairwiseDistancesArgKmin.compute_exact_distances(self)
+            PairwiseDistancesArgKmin{{pdr_suffix}}.compute_exact_distances(self)
 
     @final
     cdef void _parallel_on_X_parallel_init(
         self,
         ITYPE_t thread_num,
     ) nogil:
-        PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num)
+        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_X_parallel_init(self, thread_num)
 
         # Temporary buffer for the `-2 * X_c @ Y_c.T` term
-        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
-            self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
+        self.dist_middle_terms_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
+            self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof({{DTYPE_t}})
         )
 
     @final
@@ -1000,7 +1011,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         self,
         ITYPE_t thread_num
     ) nogil:
-        PairwiseDistancesArgKmin._parallel_on_X_parallel_finalize(self, thread_num)
+        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_X_parallel_finalize(self, thread_num)
         free(self.dist_middle_terms_chunks[thread_num])
 
     @final
@@ -1008,12 +1019,12 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         self,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin._parallel_on_Y_init(self)
+        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_Y_init(self)
 
         for thread_num in range(self.chunks_n_threads):
             # Temporary buffer for the `-2 * X_c @ Y_c.T` term
-            self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
-                self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
+            self.dist_middle_terms_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
+                self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof({{DTYPE_t}})
             )
 
     @final
@@ -1021,7 +1032,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         self,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin._parallel_on_Y_finalize(self)
+        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_Y_finalize(self)
 
         for thread_num in range(self.chunks_n_threads):
             free(self.dist_middle_terms_chunks[thread_num])
@@ -1038,10 +1049,10 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
         cdef:
             ITYPE_t i, j
 
-            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
-            DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
+            const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
+            const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            {{DTYPE_t}} *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
+            {{DTYPE_t}} *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
             # Careful: LDA, LDB and LDC are given for F-ordered arrays
@@ -1055,15 +1066,15 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
             ITYPE_t m = X_c.shape[0]
             ITYPE_t n = Y_c.shape[0]
             ITYPE_t K = X_c.shape[1]
-            DTYPE_t alpha = - 2.
+            {{DTYPE_t}} alpha = - 2.
             # Casting for A and B to remove the const is needed because APIs exposed via
             # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
             # See: https://github.com/scipy/scipy/issues/14262
-            DTYPE_t * A = <DTYPE_t*> & X_c[0, 0]
+            {{DTYPE_t}} * A = <{{DTYPE_t}}*> & X_c[0, 0]
             ITYPE_t lda = X_c.shape[1]
-            DTYPE_t * B = <DTYPE_t*> & Y_c[0, 0]
+            {{DTYPE_t}} * B = <{{DTYPE_t}}*> & Y_c[0, 0]
             ITYPE_t ldb = X_c.shape[1]
-            DTYPE_t beta = 0.
+            {{DTYPE_t}} beta = 0.
             ITYPE_t ldc = Y_c.shape[0]
 
         # dist_middle_terms = `-2 * X_c @ Y_c.T`
@@ -1088,3 +1099,5 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
                     ),
                     j + Y_start,
                 )
+
+{{endfor}}
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index f9203c7789979..6252c8abe0a06 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -24,6 +24,7 @@ def configuration(parent_package="", top_path=None):
     templates = [
         "sklearn/metrics/_dist_metrics.pyx.tp",
         "sklearn/metrics/_dist_metrics.pxd.tp",
+        "sklearn/metrics/_pairwise_distances_reduction.pyx.tp",
     ]
 
     gen_from_templates(templates)

From 451f3d5bd10bb2914cf7196d8d86fd972bc2e3a8 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 24 Feb 2022 11:32:45 +0100
Subject: [PATCH 03/26] MAINT Make PairwiseDistances{Reduction,ArgKmin} facades

This allows keeping the same interface in Python namely:

 - PairwiseDistancesReduction.is_usable_for
 - PairwiseDistancesReduction.valid_metrics
 - PairwiseDistancesArgKmin.compute

while being able to route to the 32bit and 64bit implementations
defined via Tempita.

The design pattern used here on PairwiseDistancesReduction
and PairwiseDistancesArgKmin is the Facade design pattern.

See: https://refactoring.guru/design-patterns/facade
---
 sklearn/metrics/_dist_metrics.pxd.tp          |  15 +-
 sklearn/metrics/_dist_metrics.pyx.tp          | 126 +++---
 .../_pairwise_distances_reduction.pyx.tp      | 380 ++++++++++++------
 .../test_pairwise_distances_reduction.py      |  67 ++-
 4 files changed, 390 insertions(+), 198 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index 8435597c7a186..ca81e978ba154 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -1,8 +1,15 @@
 {{py:
 
-dtypes = [
-    ('', 'DTYPE_t'),
-    ('32', 'np.float32_t'),
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, bitness, DTYPE_t, DTYPE
+    #
+    # We use an empty string so as to still be able to expose the same
+    # API for the reference 64bit implementations.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE'),
+    ('32', '32', 'np.float32_t', 'np.float32')
 ]
 
 }}
@@ -11,7 +18,7 @@ from libc.math cimport sqrt, exp
 
 from ..utils._typedefs cimport DTYPE_t, ITYPE_t
 
-{{for name_suffix, DTYPE_t in dtypes}}
+{{for name_suffix, bitness, DTYPE_t, DTYPE in implementation_specific_values}}
 
 ######################################################################
 # Inline distance functions
diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index 3ecfd0f3155ff..f3689a9adb767 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -1,9 +1,15 @@
 {{py:
 
-
-dtypes = [
-    ('', 'DTYPE_t'),
-    ('32', 'np.float32_t'),
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, bitness, DTYPE_t, DTYPE
+    #
+    # We use an empty string so as to still be able to expose the same
+    # API for the reference 64bit implementations.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE'),
+    ('32', '32', 'np.float32_t', 'np.float32')
 ]
 
 }}
@@ -40,35 +46,6 @@ def newObj(obj):
     return obj.__new__(obj)
 
 
-######################################################################
-# metric mappings
-#  These map from metric id strings to class names
-METRIC_MAPPING = {'euclidean': EuclideanDistance,
-                  'l2': EuclideanDistance,
-                  'minkowski': MinkowskiDistance,
-                  'p': MinkowskiDistance,
-                  'manhattan': ManhattanDistance,
-                  'cityblock': ManhattanDistance,
-                  'l1': ManhattanDistance,
-                  'chebyshev': ChebyshevDistance,
-                  'infinity': ChebyshevDistance,
-                  'seuclidean': SEuclideanDistance,
-                  'mahalanobis': MahalanobisDistance,
-                  'wminkowski': WMinkowskiDistance,
-                  'hamming': HammingDistance,
-                  'canberra': CanberraDistance,
-                  'braycurtis': BrayCurtisDistance,
-                  'matching': MatchingDistance,
-                  'jaccard': JaccardDistance,
-                  'dice': DiceDistance,
-                  'kulsinski': KulsinskiDistance,
-                  'rogerstanimoto': RogersTanimotoDistance,
-                  'russellrao': RussellRaoDistance,
-                  'sokalmichener': SokalMichenerDistance,
-                  'sokalsneath': SokalSneathDistance,
-                  'haversine': HaversineDistance,
-                  'pyfunc': PyFuncDistance}
-
 BOOL_METRICS = [
     "hamming",
     "matching",
@@ -94,7 +71,38 @@ def get_valid_metric_ids(L):
             if (val.__name__ in L) or (val in L)]
 
 
-{{for name_suffix, DTYPE_t in dtypes}}
+{{for name_suffix, bitness, DTYPE_t, DTYPE in implementation_specific_values}}
+
+######################################################################
+# metric mappings
+#  These map from metric id strings to class names
+METRIC_MAPPING{{name_suffix}} = {
+    'euclidean': EuclideanDistance{{name_suffix}},
+    'l2': EuclideanDistance{{name_suffix}},
+    'minkowski': MinkowskiDistance{{name_suffix}},
+    'p': MinkowskiDistance{{name_suffix}},
+    'manhattan': ManhattanDistance{{name_suffix}},
+    'cityblock': ManhattanDistance{{name_suffix}},
+    'l1': ManhattanDistance{{name_suffix}},
+    'chebyshev': ChebyshevDistance{{name_suffix}},
+    'infinity': ChebyshevDistance{{name_suffix}},
+    'seuclidean': SEuclideanDistance{{name_suffix}},
+    'mahalanobis': MahalanobisDistance{{name_suffix}},
+    'wminkowski': WMinkowskiDistance{{name_suffix}},
+    'hamming': HammingDistance{{name_suffix}},
+    'canberra': CanberraDistance{{name_suffix}},
+    'braycurtis': BrayCurtisDistance{{name_suffix}},
+    'matching': MatchingDistance{{name_suffix}},
+    'jaccard': JaccardDistance{{name_suffix}},
+    'dice': DiceDistance{{name_suffix}},
+    'kulsinski': KulsinskiDistance{{name_suffix}},
+    'rogerstanimoto': RogersTanimotoDistance{{name_suffix}},
+    'russellrao': RussellRaoDistance{{name_suffix}},
+    'sokalmichener': SokalMichenerDistance{{name_suffix}},
+    'sokalsneath': SokalSneathDistance{{name_suffix}},
+    'haversine': HaversineDistance{{name_suffix}},
+    'pyfunc': PyFuncDistance{{name_suffix}},
+}
 
 cdef inline np.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, np.npy_intp n):
     # Wrap a memory buffer with an ndarray. Warning: this is not robust.
@@ -227,8 +235,8 @@ cdef class DistanceMetric{{name_suffix}}:
     """
     def __cinit__(self):
         self.p = 2
-        self.vec = np.zeros(1, dtype=DTYPE, order='C')
-        self.mat = np.zeros((1, 1), dtype=DTYPE, order='C')
+        self.vec = np.zeros(1, dtype={{DTYPE}}, order='C')
+        self.mat = np.zeros((1, 1), dtype={{DTYPE}}, order='C')
         self.size = 1
 
     def __reduce__(self):
@@ -241,7 +249,7 @@ cdef class DistanceMetric{{name_suffix}}:
         """
         get state for pickling
         """
-        if self.__class__.__name__ == "PyFuncDistance":
+        if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}":
             return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs)
         return (float(self.p), np.asarray(self.vec), np.asarray(self.mat))
 
@@ -252,7 +260,7 @@ cdef class DistanceMetric{{name_suffix}}:
         self.p = state[0]
         self.vec = ReadonlyArrayWrapper(state[1])
         self.mat = ReadonlyArrayWrapper(state[2])
-        if self.__class__.__name__ == "PyFuncDistance":
+        if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}":
             self.func = state[3]
             self.kwargs = state[4]
         self.size = self.vec.shape[0]
@@ -270,39 +278,39 @@ cdef class DistanceMetric{{name_suffix}}:
         **kwargs
             additional arguments will be passed to the requested metric
         """
-        if isinstance(metric, DistanceMetric):
+        if isinstance(metric, DistanceMetric{{name_suffix}}):
             return metric
 
         if callable(metric):
-            return PyFuncDistance(metric, **kwargs)
+            return PyFuncDistance{{name_suffix}}(metric, **kwargs)
 
         # Map the metric string ID to the metric class
-        if isinstance(metric, type) and issubclass(metric, DistanceMetric):
+        if isinstance(metric, type) and issubclass(metric, DistanceMetric{{name_suffix}}):
             pass
         else:
             try:
-                metric = METRIC_MAPPING[metric]
+                metric = METRIC_MAPPING{{name_suffix}}[metric]
             except:
                 raise ValueError("Unrecognized metric '%s'" % metric)
 
         # In Minkowski special cases, return more efficient methods
-        if metric is MinkowskiDistance:
+        if metric is MinkowskiDistance{{name_suffix}}:
             p = kwargs.pop('p', 2)
             w = kwargs.pop('w', None)
             if p == 1 and w is None:
-                return ManhattanDistance(**kwargs)
+                return ManhattanDistance{{name_suffix}}(**kwargs)
             elif p == 2 and w is None:
-                return EuclideanDistance(**kwargs)
+                return EuclideanDistance{{name_suffix}}(**kwargs)
             elif np.isinf(p) and w is None:
-                return ChebyshevDistance(**kwargs)
+                return ChebyshevDistance{{name_suffix}}(**kwargs)
             else:
-                return MinkowskiDistance(p, w, **kwargs)
+                return MinkowskiDistance{{name_suffix}}(p, w, **kwargs)
         else:
             return metric(**kwargs)
 
     def __init__(self):
-        if self.__class__ is DistanceMetric:
-            raise NotImplementedError("DistanceMetric is an abstract class")
+        if self.__class__ is DistanceMetric{{name_suffix}}:
+            raise NotImplementedError("DistanceMetric{{name_suffix}} is an abstract class")
 
     def _validate_data(self, X):
         """Validate the input data.
@@ -484,7 +492,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
        D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
     """
     def __init__(self, V):
-        self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=DTYPE))
+        self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype={{DTYPE}}))
         self.size = self.vec.shape[0]
         self.p = 2
 
@@ -616,7 +624,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             self.vec = ReadonlyArrayWrapper(w_array)
             self.size = self.vec.shape[0]
         else:
-            self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE))
+            self.vec = ReadonlyArrayWrapper(np.asarray([], dtype={{DTYPE}}))
             self.size = 0
 
     def _validate_data(self, X):
@@ -691,7 +699,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             raise ValueError("WMinkowskiDistance requires finite p. "
                              "For p=inf, use ChebyshevDistance.")
         self.p = p
-        self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE))
+        self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype={{DTYPE}}))
         self.size = self.vec.shape[0]
 
     def _validate_data(self, X):
@@ -1240,7 +1248,7 @@ cdef class DatasetsPair{{name_suffix}}:
         Y,
         str metric="euclidean",
         dict metric_kwargs=None,
-    ) -> DatasetsPair:
+    ) -> DatasetsPair{{name_suffix}}:
         """Return the DatasetsPair implementation for the given arguments.
 
         Parameters
@@ -1270,15 +1278,15 @@ cdef class DatasetsPair{{name_suffix}}:
             The suited DatasetsPair implementation.
         """
         cdef:
-            DistanceMetric distance_metric = DistanceMetric.get_metric(
+            DistanceMetric{{name_suffix}} distance_metric = DistanceMetric{{name_suffix}}.get_metric(
                 metric,
                 **(metric_kwargs or {})
             )
 
-        if not(X.dtype == Y.dtype == np.float64):
+        if not(X.dtype == Y.dtype == np.float{{bitness}}):
             raise ValueError(
-                f"Only 64bit float datasets are supported at this time, "
-                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}"
+                f"Datasets must be of np.float{{bitness}} type. "
+                f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
             )
 
         # Metric-specific checks that do not replace nor duplicate `check_array`.
@@ -1289,9 +1297,9 @@ cdef class DatasetsPair{{name_suffix}}:
         if issparse(X) or issparse(Y):
             raise ValueError("Only dense datasets are supported for X and Y.")
 
-        return DenseDenseDatasetsPair(X, Y, distance_metric)
+        return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
 
-    def __init__(self, DistanceMetric distance_metric):
+    def __init__(self, DistanceMetric{{name_suffix}} distance_metric):
         self.distance_metric = distance_metric
 
     cdef ITYPE_t n_samples_X(self) nogil:
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index f8716cec1088c..31a8e12929c77 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -1,8 +1,15 @@
 {{py:
 
-dtypes = [
-    ('', '64', 'DTYPE_t'),
-    ('32', '32', 'np.float32_t')
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, bitness, DTYPE_t, DTYPE
+    #
+    # We use an empty string so as to still be able to expose the same
+    # API for the reference 64bit implementations.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE'),
+    ('32', '32', 'np.float32_t', 'np.float32')
 ]
 
 }}
@@ -56,36 +63,7 @@ from ..utils._typedefs import ITYPE, DTYPE
 
 np.import_array()
 
-{{for distance_suffix, pdr_suffix, DTYPE_t in dtypes}}
-
-from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
-
-cpdef {{DTYPE_t}}[::1] _sqeuclidean_row_norms{{pdr_suffix}}(
-    const {{DTYPE_t}}[:, ::1] X,
-    ITYPE_t num_threads,
-):
-    """Compute the squared euclidean norm of the rows of X in parallel.
-
-    This is faster than using np.einsum("ij, ij->i") even when using a single thread.
-    """
-    cdef:
-        # Casting for X to remove the const qualifier is needed because APIs
-        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
-        # const qualifier.
-        # See: https://github.com/scipy/scipy/issues/14262
-        {{DTYPE_t}} * X_ptr = <{{DTYPE_t}} *> &X[0, 0]
-        ITYPE_t idx = 0
-        ITYPE_t n = X.shape[0]
-        ITYPE_t d = X.shape[1]
-        {{DTYPE_t}}[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
-
-    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
-        squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)
-
-    return squared_row_norms
-
-
-cdef class PairwiseDistancesReduction{{pdr_suffix}}:
+cdef class PairwiseDistancesReduction:
     """Abstract base class for pairwise distance computation & reduction.
 
     Subclasses of this class compute pairwise distances between a set of
@@ -113,7 +91,7 @@ cdef class PairwiseDistancesReduction{{pdr_suffix}}:
 
     Parameters
     ----------
-    datasets_pair: DatasetsPair{{distance_suffix}}
+    datasets_pair: DatasetsPair
         The pair of dataset to use.
 
     chunk_size: int, default=None
@@ -158,32 +136,6 @@ cdef class PairwiseDistancesReduction{{pdr_suffix}}:
           `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
     """
 
-    cdef:
-        readonly DatasetsPair{{distance_suffix}} datasets_pair
-
-        # The number of threads that can be used is stored in effective_n_threads.
-        #
-        # The number of threads to use in the parallelisation strategy
-        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
-        # for small datasets, less threads might be needed to loop over pair of chunks.
-        #
-        # Hence the number of threads that _will_ be used for looping over chunks
-        # is stored in chunks_n_threads, allowing solely using what we need.
-        #
-        # Thus, an invariant is:
-        #
-        #                 chunks_n_threads <= effective_n_threads
-        #
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-
-        ITYPE_t n_samples_chunk, chunk_size
-
-        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
-        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
-
-        bint execute_in_parallel_on_Y
-
     @classmethod
     def valid_metrics(cls) -> List[str]:
         excluded = {
@@ -198,6 +150,11 @@ cdef class PairwiseDistancesReduction{{pdr_suffix}}:
         }
         return sorted(set(METRIC_MAPPING.keys()) - excluded)
 
+    @classmethod
+    def valid_dtypes(cls):
+        return (np.float32, np.float64)
+
+
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
         """Return True if the PairwiseDistancesReduction can be used for the given parameters.
@@ -219,12 +176,236 @@ cdef class PairwiseDistancesReduction{{pdr_suffix}}:
         -------
         True if the PairwiseDistancesReduction can be used, else False.
         """
-        # TODO: support sparse arrays and 32 bits
+        dtypes_validity = X.dtype == Y.dtype and Y.dtype in cls.valid_dtypes()
         return (get_config().get("enable_cython_pairwise_dist", True) and
-                not issparse(X) and X.dtype == np.float64 and
-                not issparse(Y) and Y.dtype == np.float64 and
+                not issparse(X) and not issparse(Y) and dtypes_validity and
                 metric in cls.valid_metrics())
 
+
+cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
+    """Compute the argkmin of row vectors of X on the ones of Y.
+
+    For each row vector of X, computes the indices of the k row
+    vectors of Y with the smallest distances.
+
+    PairwiseDistancesArgKmin is typically used to perform
+    bruteforce k-nearest neighbors queries.
+
+    Parameters
+    ----------
+    datasets_pair: DatasetsPair
+        The dataset pairs (X, Y) for the reduction.
+
+    chunk_size: int, default=None,
+        The number of vectors per chunk. If None (default) looks-up in
+        scikit-learn configuration for `pairwise_dist_chunk_size`,
+        and use 256 if it is not set.
+
+    n_threads: int, default=None
+        The number of OpenMP threads to use for the reduction.
+        Parallelism is done on chunks and the sharding of chunks
+        depends on the `strategy` set on
+        :meth:`~PairwiseDistancesArgKmin.compute`.
+
+        See _openmp_effective_n_threads, for details about
+        the specification of n_threads.
+
+    k: int, default=1
+        The k for the argkmin reduction.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        ITYPE_t k,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        n_threads=None,
+        str strategy=None,
+        bint return_distance=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        k : int
+            The k for the argkmin reduction.
+
+        metric : str, default='euclidean'
+            The distance metric to use for argkmin.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        n_threads : int, default=None
+            The number of OpenMP threads to use for the reduction.
+            Parallelism is done on chunks and the sharding of chunks
+            depends on the `strategy` set on
+            :meth:`~PairwiseDistancesArgKmin.compute`.
+
+            See _openmp_effective_n_threads, for details about
+            the specification of n_threads.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset parallelization is made on.
+
+            For both strategies the computations happen with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+              - None (default) looks-up in scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+            If return_distance=False:
+              - argkmin_indices : ndarray of shape (n_samples_X, k)
+                Indices of the argkmin for each vector in X.
+
+            If return_distance=True:
+              - argkmin_distances : ndarray of shape (n_samples_X, k)
+                Distances to the argkmin for each vector in X.
+              - argkmin_indices : ndarray of shape (n_samples_X, k)
+                Indices of the argkmin for each vector in X.
+
+        Notes
+        -----
+            This public classmethod is responsible for introspecting the arguments
+            values to dispatch to the proper implementations of
+            :meth:`PairwiseDistancesArgKmin` given the types.
+
+            All temporarily allocated datastructures necessary for the concrete
+            implementation are therefore freed when this classmethod returns.
+
+            This allows decoupling the interface entirely from the
+            implementation details whilst maintaining RAII.
+        """
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistancesArgKmin64.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                n_threads=n_threads,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+        if X.dtype == Y.dtype == np.float32:
+            return PairwiseDistancesArgKmin32.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                n_threads=n_threads,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+        raise ValueError(
+            "No implementation exist for fused-typed datasets pair. "
+            f"Currently X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+
+{{for distance_suffix, bitness, DTYPE_t, DTYPE in implementation_specific_values}}
+
+from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
+
+cpdef {{DTYPE_t}}[::1] _sqeuclidean_row_norms{{bitness}}(
+    const {{DTYPE_t}}[:, ::1] X,
+    ITYPE_t num_threads,
+):
+    """Compute the squared euclidean norm of the rows of X in parallel.
+
+    This is faster than using np.einsum("ij, ij->i") even when using a single thread.
+    """
+    cdef:
+        # Casting for X to remove the const qualifier is needed because APIs
+        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
+        # const qualifier.
+        # See: https://github.com/scipy/scipy/issues/14262
+        {{DTYPE_t}} * X_ptr = <{{DTYPE_t}} *> &X[0, 0]
+        ITYPE_t idx = 0
+        ITYPE_t n = X.shape[0]
+        ITYPE_t d = X.shape[1]
+        {{DTYPE_t}}[::1] squared_row_norms = np.empty(n, dtype={{DTYPE}})
+
+    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
+        squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)
+
+    return squared_row_norms
+
+
+cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
+    """{{bitness}}bit implementation of PairwiseDistancesReduction."""
+
+    cdef:
+        readonly DatasetsPair{{distance_suffix}} datasets_pair
+
+        # The number of threads that can be used is stored in effective_n_threads.
+        #
+        # The number of threads to use in the parallelisation strategy
+        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
+        # for small datasets, less threads might be needed to loop over pair of chunks.
+        #
+        # Hence the number of threads that _will_ be used for looping over chunks
+        # is stored in chunks_n_threads, allowing solely using what we need.
+        #
+        # Thus, an invariant is:
+        #
+        #                 chunks_n_threads <= effective_n_threads
+        #
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+
+        ITYPE_t n_samples_chunk, chunk_size
+
+        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
+        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
+
+        bint execute_in_parallel_on_Y
+
     def __init__(
         self,
         DatasetsPair{{distance_suffix}} datasets_pair,
@@ -426,7 +607,7 @@ cdef class PairwiseDistancesReduction{{pdr_suffix}}:
     ) nogil:
         """Compute the pairwise distances on two chunks of X and Y and reduce them.
 
-        This is THE core computational method of PairwiseDistanceReductions{{pdr_suffix}}.
+        This is THE core computational method of PairwiseDistancesReduction{{bitness}}.
         This must be implemented in subclasses.
         """
         return
@@ -502,37 +683,8 @@ cdef class PairwiseDistancesReduction{{pdr_suffix}}:
         """Update datastructures after executing all the reductions."""
         return
 
-cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pdr_suffix}}):
-    """Compute the argkmin of row vectors of X on the ones of Y.
-
-    For each row vector of X, computes the indices of k first the rows
-    vectors of Y with the smallest distances.
-
-    PairwiseDistancesArgKmin is typically used to perform
-    bruteforce k-nearest neighbors queries.
-
-    Parameters
-    ----------
-    datasets_pair: DatasetsPair
-        The dataset pairs (X, Y) for the reduction.
-
-    chunk_size: int, default=None,
-        The number of vectors per chunk. If None (default) looks-up in
-        scikit-learn configuration for `pairwise_dist_chunk_size`,
-        and use 256 if it is not set.
-
-    n_threads: int, default=None
-        The number of OpenMP threads to use for the reduction.
-        Parallelism is done on chunks and the sharding of chunks
-        depends on the `strategy` set on
-        :meth:`~PairwiseDistancesArgKmin{{distance_suffix}}.compute`.
-
-        See _openmp_effective_n_threads, for details about
-        the specification of n_threads.
-
-    k: int, default=1
-        The k for the argkmin reduction.
-    """
+cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitness}}):
+    """{{bitness}}bit implementation of PairwiseDistancesArgKmin."""
 
     cdef:
         ITYPE_t k
@@ -587,7 +739,7 @@ cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pd
             The number of OpenMP threads to use for the reduction.
             Parallelism is done on chunks and the sharding of chunks
             depends on the `strategy` set on
-            :meth:`~PairwiseDistancesArgKmin{{pdr_suffix}}.compute`.
+            :meth:`~PairwiseDistancesArgKmin{{bitness}}.compute`.
 
             See _openmp_effective_n_threads, for details about
             the specification of n_threads.
@@ -639,14 +791,14 @@ cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pd
         Notes
         -----
             This public classmethod is responsible for introspecting the arguments
-            values to dispatch to the private :meth:`PairwiseDistancesArgKmin{{pdr_suffix}}._compute`
-            instance method of the most appropriate :class:`PairwiseDistancesArgKmin{{pdr_suffix}}`
+            values to dispatch to the private :meth:`PairwiseDistancesArgKmin{{bitness}}._compute`
+            instance method of the most appropriate :class:`PairwiseDistancesArgKmin{{bitness}}`
             concrete implementation.
 
             All temporarily allocated datastructures necessary for the concrete
             implementation are therefore freed when this classmethod returns.
 
-            This allows entirely decoupling the interface entirely from the
+            This allows decoupling the interface entirely from the
             implementation details whilst maintaining RAII.
         """
         # Note (jjerphan): Some design thoughts for future extensions.
@@ -664,7 +816,7 @@ cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pd
             # at time to leverage a call to the BLAS GEMM routine as explained
             # in more details in the docstring.
             use_squared_distances = metric == "sqeuclidean"
-            pda = FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(
+            pda = FastEuclideanPairwiseDistancesArgKmin{{bitness}}(
                 X=X, Y=Y, k=k,
                 use_squared_distances=use_squared_distances,
                 chunk_size=chunk_size,
@@ -674,7 +826,7 @@ cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pd
         else:
              # Fall back on a generic implementation that handles most scipy
              # metrics by computing the distances between 2 vectors at a time.
-            pda = PairwiseDistancesArgKmin{{pdr_suffix}}(
+            pda = PairwiseDistancesArgKmin{{bitness}}(
                 datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs),
                 k=k,
                 chunk_size=chunk_size,
@@ -723,9 +875,9 @@ cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pd
             sizeof(ITYPE_t *) * self.chunks_n_threads
         )
 
-        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin{{pdr_suffix}}.compute`.
+        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin{{bitness}}.compute`.
         self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE)
-        self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE)
+        self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype={{DTYPE}})
 
     def __dealloc__(self):
         if self.heaps_indices_chunks is not NULL:
@@ -896,14 +1048,14 @@ cdef class PairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesReduction{{pd
 
             # Values are returned identically to the way `KNeighborsMixin.kneighbors`
             # returns values. This is counter-intuitive but this allows not using
-            # complex adaptations where `PairwiseDistancesArgKmin{{pdr_suffix}}.compute` is called.
+            # complex adaptations where `PairwiseDistancesArgKmin{{bitness}}.compute` is called.
             return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
 
         return np.asarray(self.argkmin_indices)
 
 
-cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistancesArgKmin{{pdr_suffix}}):
-    """Fast specialized alternative for PairwiseDistancesArgKmin{{pdr_suffix}} on EuclideanDistance.
+cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArgKmin{{bitness}}):
+    """Fast specialized alternative for PairwiseDistancesArgKmin{{bitness}} on EuclideanDistance.
 
     The full pairwise squared distances matrix is computed as follows:
 
@@ -931,7 +1083,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistances
 
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
-        return (PairwiseDistancesArgKmin{{pdr_suffix}}.is_usable_for(X, Y, metric) and
+        return (PairwiseDistancesArgKmin{{bitness}}.is_usable_for(X, Y, metric) and
                 not _in_unstable_openblas_configuration())
 
     def __init__(
@@ -971,12 +1123,12 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistances
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared")
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{pdr_suffix}}(self.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(self.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{pdr_suffix}}(self.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{bitness}}(self.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
@@ -992,14 +1144,14 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistances
     @final
     cdef void compute_exact_distances(self) nogil:
         if not self.use_squared_distances:
-            PairwiseDistancesArgKmin{{pdr_suffix}}.compute_exact_distances(self)
+            PairwiseDistancesArgKmin{{bitness}}.compute_exact_distances(self)
 
     @final
     cdef void _parallel_on_X_parallel_init(
         self,
         ITYPE_t thread_num,
     ) nogil:
-        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_X_parallel_init(self, thread_num)
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_init(self, thread_num)
 
         # Temporary buffer for the `-2 * X_c @ Y_c.T` term
         self.dist_middle_terms_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
@@ -1011,7 +1163,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistances
         self,
         ITYPE_t thread_num
     ) nogil:
-        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_X_parallel_finalize(self, thread_num)
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_finalize(self, thread_num)
         free(self.dist_middle_terms_chunks[thread_num])
 
     @final
@@ -1019,7 +1171,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistances
         self,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_Y_init(self)
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_init(self)
 
         for thread_num in range(self.chunks_n_threads):
             # Temporary buffer for the `-2 * X_c @ Y_c.T` term
@@ -1032,7 +1184,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{pdr_suffix}}(PairwiseDistances
         self,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin{{pdr_suffix}}._parallel_on_Y_finalize(self)
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_finalize(self)
 
         for thread_num in range(self.chunks_n_threads):
             free(self.dist_middle_terms_chunks[thread_num])
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index b9f3d7dbf3dd5..75c1c90a71466 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -7,7 +7,8 @@
 from sklearn.metrics._pairwise_distances_reduction import (
     PairwiseDistancesReduction,
     PairwiseDistancesArgKmin,
-    _sqeuclidean_row_norms,
+    _sqeuclidean_row_norms64,
+    _sqeuclidean_row_norms32,
 )
 
 from sklearn.metrics import euclidean_distances
@@ -64,7 +65,7 @@ def _get_dummy_metric_params_list(metric: str, n_features: int):
     return [{}]
 
 
-def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices):
+def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
     assert_array_equal(
         ref_indices,
         indices,
@@ -74,7 +75,7 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices):
         ref_dist,
         dist,
         err_msg="Query vectors have different neighbors' distances",
-        rtol=1e-7,
+        rtol=rtol,
     )
 
 
@@ -88,13 +89,18 @@ def test_pairwise_distances_reduction_is_usable_for():
     X = rng.rand(100, 10)
     Y = rng.rand(100, 10)
     metric = "euclidean"
-    assert PairwiseDistancesReduction.is_usable_for(X, Y, metric)
+
+    assert PairwiseDistancesReduction.is_usable_for(
+        X.astype(np.float64), X.astype(np.float64), metric
+    )
+    assert PairwiseDistancesReduction.is_usable_for(
+        X.astype(np.float32), X.astype(np.float32), metric
+    )
     assert not PairwiseDistancesReduction.is_usable_for(
         X.astype(np.int64), Y.astype(np.int64), metric
     )
 
     assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc")
-    # TODO: remove once 32 bits datasets are supported
     assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric)
     assert not PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric)
 
@@ -111,8 +117,8 @@ def test_argkmin_factory_method_wrong_usages():
     metric = "euclidean"
 
     msg = (
-        "Only 64bit float datasets are supported at this time, "
-        "got: X.dtype=float32 and Y.dtype=float64"
+        "No implementation exist for fused-typed datasets pair. "
+        "Currently X.dtype=float32 and Y.dtype=float64."
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(
@@ -120,8 +126,8 @@ def test_argkmin_factory_method_wrong_usages():
         )
 
     msg = (
-        "Only 64bit float datasets are supported at this time, "
-        "got: X.dtype=float64 and Y.dtype=int32"
+        "No implementation exist for fused-typed datasets pair. "
+        "Currently X.dtype=float64 and Y.dtype=int32."
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)
@@ -151,6 +157,7 @@ def test_argkmin_factory_method_wrong_usages():
 @pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("chunk_size", [50, 512, 1024])
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin],
@@ -160,8 +167,8 @@ def test_chunk_size_agnosticism(
     seed,
     n_samples,
     chunk_size,
+    dtype,
     n_features=100,
-    dtype=np.float64,
 ):
     # Results should not depend on the chunk size
     rng = np.random.RandomState(seed)
@@ -191,12 +198,16 @@ def test_chunk_size_agnosticism(
         return_distance=True,
     )
 
-    ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
+    rtol = 1e-7 if dtype is np.float64 else 1e-6
+    ASSERT_RESULT[PairwiseDistancesReduction](
+        ref_dist, dist, ref_indices, indices, rtol
+    )
 
 
 @pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("chunk_size", [50, 512, 1024])
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin],
@@ -206,8 +217,8 @@ def test_n_threads_agnosticism(
     seed,
     n_samples,
     chunk_size,
+    dtype,
     n_features=100,
-    dtype=np.float64,
 ):
     # Results should not depend on the number of threads
     rng = np.random.RandomState(seed)
@@ -233,7 +244,10 @@ def test_n_threads_agnosticism(
         X, Y, parameter, n_threads=1, return_distance=True
     )
 
-    ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
+    rtol = 1e-7 if dtype is np.float64 else 1e-6
+    ASSERT_RESULT[PairwiseDistancesReduction](
+        ref_dist, dist, ref_indices, indices, rtol
+    )
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@@ -241,6 +255,7 @@ def test_n_threads_agnosticism(
 @pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics())
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin],
@@ -250,8 +265,8 @@ def test_strategies_consistency(
     metric,
     n_samples,
     seed,
+    dtype,
     n_features=10,
-    dtype=np.float64,
 ):
 
     rng = np.random.RandomState(seed)
@@ -297,11 +312,13 @@ def test_strategies_consistency(
         return_distance=True,
     )
 
+    rtol = 1e-7 if dtype is np.float64 else 1e-6
     ASSERT_RESULT[PairwiseDistancesReduction](
         dist_par_X,
         dist_par_Y,
         indices_par_X,
         indices_par_Y,
+        rtol,
     )
 
 
@@ -312,15 +329,16 @@ def test_strategies_consistency(
 @pytest.mark.parametrize("n_features", [50, 500])
 @pytest.mark.parametrize("translation", [0, 1e6])
 @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
 def test_pairwise_distances_argkmin(
     n_features,
     translation,
     metric,
     strategy,
+    dtype,
     n_samples=100,
     k=10,
-    dtype=np.float64,
 ):
     rng = np.random.RandomState(0)
     spread = 1000
@@ -361,8 +379,13 @@ def test_pairwise_distances_argkmin(
         strategy=strategy,
     )
 
+    rtol = 1e-7 if dtype is np.float64 else 1e-6
     ASSERT_RESULT[PairwiseDistancesArgKmin](
-        argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref
+        argkmin_distances,
+        argkmin_distances_ref,
+        argkmin_indices,
+        argkmin_indices_ref,
+        rtol,
     )
 
 
@@ -375,13 +398,15 @@ def test_sqeuclidean_row_norms(
     n_samples,
     n_features,
     num_threads,
-    dtype=np.float64,
 ):
     rng = np.random.RandomState(seed)
     spread = 100
-    X = rng.rand(n_samples, n_features).astype(dtype) * spread
+    X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
+    X32 = rng.rand(n_samples, n_features).astype(np.float32) * spread
 
-    sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
-    sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads))
+    sq_row_norm_reference = np.linalg.norm(X64, axis=1) ** 2
+    sq_row_norm64 = np.asarray(_sqeuclidean_row_norms64(X64, num_threads=num_threads))
+    sq_row_norm32 = np.asarray(_sqeuclidean_row_norms32(X32, num_threads=num_threads))
 
-    assert_allclose(sq_row_norm_reference, sq_row_norm)
+    assert_allclose(sq_row_norm_reference, sq_row_norm32)
+    assert_allclose(sq_row_norm_reference, sq_row_norm64)

From 327d2490e6044f137dd44c141f611456463c8f45 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Feb 2022 11:59:49 +0100
Subject: [PATCH 04/26] TST Fix test_sqeuclidean_row_norms

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 07b86bcb5a960..e295d2e897260 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -404,11 +404,11 @@ def test_sqeuclidean_row_norms(
     rng = np.random.RandomState(seed)
     spread = 100
     X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
-    X32 = rng.rand(n_samples, n_features).astype(np.float32) * spread
+    X32 = X64.astype(np.float32)
 
     sq_row_norm_reference = np.linalg.norm(X64, axis=1) ** 2
     sq_row_norm64 = np.asarray(_sqeuclidean_row_norms64(X64, num_threads=num_threads))
     sq_row_norm32 = np.asarray(_sqeuclidean_row_norms32(X32, num_threads=num_threads))
 
-    assert_allclose(sq_row_norm_reference, sq_row_norm32)
-    assert_allclose(sq_row_norm_reference, sq_row_norm64)
+    assert_allclose(sq_row_norm_reference, sq_row_norm64, rtol=1e-7)
+    assert_allclose(sq_row_norm_reference, sq_row_norm32, rtol=1e-6)

From baf2fc667d3cf198695d3b709ed19c4b41e8e8c9 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Feb 2022 15:05:22 +0100
Subject: [PATCH 05/26] TST Add fixture to test quasi-equality for 32bit

---
 .../test_pairwise_distances_reduction.py      | 80 ++++++++++++++++---
 1 file changed, 67 insertions(+), 13 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index e295d2e897260..529c2228b5ddf 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -80,8 +80,68 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1
     )
 
 
+def assert_argkmin_results_quasi_equality(
+    ref_dist, dist, ref_indices, indices, rtol=1e-4
+):
+
+    ref_dist, dist, ref_indices, indices = map(
+        np.ndarray.flatten, [ref_dist, dist, ref_indices, indices]
+    )
+
+    assert (
+        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
+    ), "Arrays of results have various length."
+
+    n = len(ref_dist)
+
+    skip_permutation_check = False
+
+    for i in range(n - 1):
+        # We test the equality of pair of adjacent indices and distances
+        # of the references against the results.
+        rd_current, rd_next = ref_dist[i], ref_dist[i + 1]
+        d_current, d_next = dist[i], dist[i + 1]
+        ri_current, ri_next = ref_indices[i], ref_indices[i + 1]
+        i_current, i_next = indices[i], indices[i + 1]
+
+        assert np.isclose(
+            d_current, rd_current, rtol=rtol
+        ), "Query vectors have different neighbors' distances"
+
+        if ri_current != i_current:
+            # If the current reference index and index are different,
+            # it might be that they were permuted because their distances
+            # are relatively close to each other.
+            # In this case, we need to check for a valid permutation.
+            valid_permutation = (
+                np.isclose(d_current, d_next, rtol=rtol)
+                and i_next == ri_current
+                and ri_next == i_current
+            )
+            assert skip_permutation_check or valid_permutation, (
+                "Query vectors have different neighbors' indices \n"
+                f"(i_current, i_next) = {i_current, i_next} \n"
+                f"(ri_current, ri_next) = {ri_current, ri_next} \n"
+                f"(d_current, d_next) = {d_current, d_next} \n"
+                f"(rd_current, rd_next) = {rd_current, rd_next} \n"
+            )
+            # If there's a permutation at this iteration, we need to
+            # skip the following permutation check.
+            skip_permutation_check = True
+            continue
+
+        # We need to check for potential permutations for the next iterations.
+        if skip_permutation_check:
+            skip_permutation_check = False
+
+
 ASSERT_RESULT = {
-    PairwiseDistancesArgKmin: assert_argkmin_results_equality,
+    # In the case of 64bit, we test for exact equality.
+    (PairwiseDistancesArgKmin, np.float64): assert_argkmin_results_equality,
+    # In the case of 32bit, indices can be permuted due to small difference
+    # in the computations of their associated distances, hence we test equality of
+    # results up to valid permutations.
+    (PairwiseDistancesArgKmin, np.float32): assert_argkmin_results_quasi_equality,
 }
 
 
@@ -199,9 +259,8 @@ def test_chunk_size_agnosticism(
         return_distance=True,
     )
 
-    rtol = 1e-7 if dtype is np.float64 else 1e-6
-    ASSERT_RESULT[PairwiseDistancesReduction](
-        ref_dist, dist, ref_indices, indices, rtol
+    ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)](
+        ref_dist, dist, ref_indices, indices
     )
 
 
@@ -246,9 +305,8 @@ def test_n_threads_agnosticism(
             X, Y, parameter, return_distance=True
         )
 
-    rtol = 1e-7 if dtype is np.float64 else 1e-6
-    ASSERT_RESULT[PairwiseDistancesReduction](
-        ref_dist, dist, ref_indices, indices, rtol
+    ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)](
+        ref_dist, dist, ref_indices, indices
     )
 
 
@@ -314,13 +372,11 @@ def test_strategies_consistency(
         return_distance=True,
     )
 
-    rtol = 1e-7 if dtype is np.float64 else 1e-6
-    ASSERT_RESULT[PairwiseDistancesReduction](
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
         dist_par_X,
         dist_par_Y,
         indices_par_X,
         indices_par_Y,
-        rtol,
     )
 
 
@@ -381,13 +437,11 @@ def test_pairwise_distances_argkmin(
         strategy=strategy,
     )
 
-    rtol = 1e-7 if dtype is np.float64 else 1e-6
-    ASSERT_RESULT[PairwiseDistancesArgKmin](
+    ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)](
         argkmin_distances,
         argkmin_distances_ref,
         argkmin_indices,
         argkmin_indices_ref,
-        rtol,
     )
 
 

From 3c80b40c7e73c7882d8819f12d61c24b31cdf377 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Feb 2022 15:29:20 +0100
Subject: [PATCH 06/26] MAINT Do not route 32bit specialized implementation yet

---
 sklearn/metrics/_pairwise_distances_reduction.pyx.tp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index fa0ab2b1f7226..2803e65416ab8 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -770,6 +770,12 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
             metric in ("euclidean", "sqeuclidean")
             and not issparse(X)
             and not issparse(Y)
+            # As of now, we do not route the 32bit case to the specialized
+            # implementation as defining the upcast fixtures for GEMM using
+            # Tempita is non-trivial
+            # TODO: find a way to define the upcasting fixtures with Tempita
+            # for the FastEuclideanPairwiseDistancesArgKmin32.
+            and {{bitness}} == 64
         ):
             # Specialized implementation with improved arithmetic intensity
             # and vector instructions (SIMD) by processing several vectors

From 3d9d5653ab8558a4688a92feef1059a9ebd716da Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Feb 2022 15:41:00 +0100
Subject: [PATCH 07/26] TST Adapt DistanceMetrics tests for 32bit

---
 sklearn/metrics/tests/test_dist_metrics.py | 74 +++++++++++-----------
 1 file changed, 38 insertions(+), 36 deletions(-)

diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
index 6c841d1d44f8c..2bf6b1f4ff259 100644
--- a/sklearn/metrics/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -24,16 +24,18 @@ def dist_func(x1, x2, p):
 d = 4
 n1 = 20
 n2 = 25
-X1 = rng.random_sample((n1, d)).astype("float64", copy=False)
-X2 = rng.random_sample((n2, d)).astype("float64", copy=False)
+X64 = rng.random_sample((n1, d)).astype("float64", copy=False)
+Y64 = rng.random_sample((n2, d)).astype("float64", copy=False)
+X32 = X64.astype("float32")
+Y32 = Y64.astype("float32")
 
-[X1_mmap, X2_mmap] = create_memmap_backed_data([X1, X2])
+[X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64])
 
 # make boolean arrays: ones and zeros
-X1_bool = X1.round(0)
-X2_bool = X2.round(0)
+X_bool = X64.round(0)
+Y_bool = Y64.round(0)
 
-[X1_bool_mmap, X2_bool_mmap] = create_memmap_backed_data([X1_bool, X2_bool])
+[X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool])
 
 
 V = rng.random_sample((d, d))
@@ -65,27 +67,27 @@ def dist_func(x1, x2, p):
     )
 
 
-def check_cdist(metric, kwargs, X1, X2):
+def check_cdist(metric, kwargs, X, Y):
     if metric == "wminkowski":
         # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0
         WarningToExpect = None
         if sp_version >= parse_version("1.6.0"):
             WarningToExpect = DeprecationWarning
         with pytest.warns(WarningToExpect):
-            D_scipy_cdist = cdist(X1, X2, metric, **kwargs)
+            D_scipy_cdist = cdist(X, Y, metric, **kwargs)
     else:
-        D_scipy_cdist = cdist(X1, X2, metric, **kwargs)
+        D_scipy_cdist = cdist(X, Y, metric, **kwargs)
 
     dm = DistanceMetric.get_metric(metric, **kwargs)
-    D_sklearn = dm.pairwise(X1, X2)
+    D_sklearn = dm.pairwise(X, Y)
     assert_array_almost_equal(D_sklearn, D_scipy_cdist)
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
-@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
-def test_cdist(metric_param_grid, X1, X2):
+@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
+def test_cdist(metric_param_grid, X, Y):
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -96,29 +98,29 @@ def test_cdist(metric_param_grid, X1, X2):
             pytest.xfail(
                 "scipy#13861: cdist with 'mahalanobis' fails on joblib memmap data"
             )
-        check_cdist(metric, kwargs, X1, X2)
+        check_cdist(metric, kwargs, X, Y)
 
 
 @pytest.mark.parametrize("metric", BOOL_METRICS)
 @pytest.mark.parametrize(
-    "X1_bool, X2_bool", [(X1_bool, X2_bool), (X1_bool_mmap, X2_bool_mmap)]
+    "X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)]
 )
-def test_cdist_bool_metric(metric, X1_bool, X2_bool):
-    D_true = cdist(X1_bool, X2_bool, metric)
+def test_cdist_bool_metric(metric, X_bool, Y_bool):
+    D_true = cdist(X_bool, Y_bool, metric)
     check_cdist_bool(metric, D_true)
 
 
 def check_cdist_bool(metric, D_true):
     dm = DistanceMetric.get_metric(metric)
-    D12 = dm.pairwise(X1_bool, X2_bool)
+    D12 = dm.pairwise(X_bool, Y_bool)
     assert_array_almost_equal(D12, D_true)
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
-@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
-def test_pdist(metric_param_grid, X1, X2):
+@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
+def test_pdist(metric_param_grid, X, Y):
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -135,29 +137,29 @@ def test_pdist(metric_param_grid, X1, X2):
             if sp_version >= parse_version("1.6.0"):
                 ExceptionToAssert = DeprecationWarning
             with pytest.warns(ExceptionToAssert):
-                D_true = cdist(X1, X1, metric, **kwargs)
+                D_true = cdist(X, X, metric, **kwargs)
         else:
-            D_true = cdist(X1, X1, metric, **kwargs)
+            D_true = cdist(X, X, metric, **kwargs)
 
         check_pdist(metric, kwargs, D_true)
 
 
 @pytest.mark.parametrize("metric", BOOL_METRICS)
-@pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap])
-def test_pdist_bool_metrics(metric, X1_bool):
-    D_true = cdist(X1_bool, X1_bool, metric)
+@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
+def test_pdist_bool_metrics(metric, X_bool):
+    D_true = cdist(X_bool, X_bool, metric)
     check_pdist_bool(metric, D_true)
 
 
 def check_pdist(metric, kwargs, D_true):
     dm = DistanceMetric.get_metric(metric, **kwargs)
-    D12 = dm.pairwise(X1)
+    D12 = dm.pairwise(X64)
     assert_array_almost_equal(D12, D_true)
 
 
 def check_pdist_bool(metric, D_true):
     dm = DistanceMetric.get_metric(metric)
-    D12 = dm.pairwise(X1_bool)
+    D12 = dm.pairwise(X_bool)
     # Based on https://github.com/scipy/scipy/pull/7373
     # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
     # was changed to return 0, instead of nan.
@@ -186,20 +188,20 @@ def test_pickle(writable_kwargs, metric_param_grid):
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric", BOOL_METRICS)
-@pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap])
-def test_pickle_bool_metrics(metric, X1_bool):
+@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
+def test_pickle_bool_metrics(metric, X_bool):
     dm = DistanceMetric.get_metric(metric)
-    D1 = dm.pairwise(X1_bool)
+    D1 = dm.pairwise(X_bool)
     dm2 = pickle.loads(pickle.dumps(dm))
-    D2 = dm2.pairwise(X1_bool)
+    D2 = dm2.pairwise(X_bool)
     assert_array_almost_equal(D1, D2)
 
 
 def check_pickle(metric, kwargs):
     dm = DistanceMetric.get_metric(metric, **kwargs)
-    D1 = dm.pairwise(X1)
+    D1 = dm.pairwise(X64)
     dm2 = pickle.loads(pickle.dumps(dm))
-    D2 = dm2.pairwise(X1)
+    D2 = dm2.pairwise(X64)
     assert_array_almost_equal(D1, D2)
 
 
@@ -305,11 +307,11 @@ def test_minkowski_metric_validate_weights_size():
     dm = DistanceMetric.get_metric("minkowski", p=3, w=w2)
     msg = (
         "MinkowskiDistance: the size of w must match "
-        f"the number of features \\({X1.shape[1]}\\). "
+        f"the number of features \\({X64.shape[1]}\\). "
         f"Currently len\\(w\\)={w2.shape[0]}."
     )
     with pytest.raises(ValueError, match=msg):
-        dm.pairwise(X1, X2)
+        dm.pairwise(X64, Y64)
 
 
 # TODO: Remove in 1.3 when wminkowski is removed
@@ -328,6 +330,6 @@ def test_wminkowski_minkowski_equivalence(p):
     # Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of 'minkowski'
     dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p))
     dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w)
-    D_wmks = dm_wmks.pairwise(X1, X2)
-    D_mks = dm_mks.pairwise(X1, X2)
+    D_wmks = dm_wmks.pairwise(X64, Y64)
+    D_mks = dm_mks.pairwise(X64, Y64)
     assert_array_almost_equal(D_wmks, D_mks)

From eb0c65e98e40e62b0db25c5e85a5c71f18073842 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Feb 2022 16:06:11 +0100
Subject: [PATCH 08/26] fixup! TST Adapt DistanceMetrics tests for 32bit

---
 sklearn/metrics/_dist_metrics.pyx.tp       | 14 ++--
 sklearn/metrics/tests/test_dist_metrics.py | 78 ++++++++++------------
 2 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index f3689a9adb767..b2fc047d570d0 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -434,17 +434,17 @@ cdef class DistanceMetric{{name_suffix}}:
         cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr
         cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Darr
 
-        Xarr = np.asarray(X, dtype=DTYPE, order='C')
+        Xarr = np.asarray(X, dtype={{DTYPE}}, order='C')
         self._validate_data(Xarr)
         if Y is None:
             Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
-                         dtype=DTYPE, order='C')
+                         dtype={{DTYPE}}, order='C')
             self.pdist(Xarr, Darr)
         else:
-            Yarr = np.asarray(Y, dtype=DTYPE, order='C')
+            Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C')
             self._validate_data(Yarr)
             Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
-                         dtype=DTYPE, order='C')
+                         dtype={{DTYPE}}, order='C')
             self.cdist(Xarr, Yarr, Darr)
         return Darr
 
@@ -617,7 +617,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
         self.p = p
         if w is not None:
             w_array = check_array(
-                w, ensure_2d=False, dtype=DTYPE, input_name="w"
+                w, ensure_2d=False, dtype={{DTYPE}}, input_name="w"
             )
             if (w_array < 0).any():
                 raise ValueError("w cannot contain negative weights")
@@ -759,12 +759,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
         if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
             raise ValueError("V/VI must be square")
 
-        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=float, order='C'))
+        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype={{DTYPE}}, order='C'))
 
         self.size = self.mat.shape[0]
 
         # we need vec as a work buffer
-        self.vec = np.zeros(self.size, dtype=DTYPE)
+        self.vec = np.zeros(self.size, dtype={{DTYPE}})
 
     def _validate_data(self, X):
         if X.shape[1] != self.size:
diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
index 2bf6b1f4ff259..f6bab12e0efcd 100644
--- a/sklearn/metrics/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -10,6 +10,9 @@
 import scipy.sparse as sp
 from scipy.spatial.distance import cdist
 from sklearn.metrics import DistanceMetric
+
+# Private DistanceMetric for 32 bit
+from sklearn.metrics._dist_metrics import DistanceMetric32
 from sklearn.metrics._dist_metrics import BOOL_METRICS
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import create_memmap_backed_data
@@ -67,27 +70,14 @@ def dist_func(x1, x2, p):
     )
 
 
-def check_cdist(metric, kwargs, X, Y):
-    if metric == "wminkowski":
-        # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0
-        WarningToExpect = None
-        if sp_version >= parse_version("1.6.0"):
-            WarningToExpect = DeprecationWarning
-        with pytest.warns(WarningToExpect):
-            D_scipy_cdist = cdist(X, Y, metric, **kwargs)
-    else:
-        D_scipy_cdist = cdist(X, Y, metric, **kwargs)
-
-    dm = DistanceMetric.get_metric(metric, **kwargs)
-    D_sklearn = dm.pairwise(X, Y)
-    assert_array_almost_equal(D_sklearn, D_scipy_cdist)
-
-
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
 @pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
 def test_cdist(metric_param_grid, X, Y):
+    DistanceMetricInterface = (
+        DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32
+    )
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -98,7 +88,20 @@ def test_cdist(metric_param_grid, X, Y):
             pytest.xfail(
                 "scipy#13861: cdist with 'mahalanobis' fails on joblib memmap data"
             )
-        check_cdist(metric, kwargs, X, Y)
+
+        if metric == "wminkowski":
+            # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0
+            WarningToExpect = None
+            if sp_version >= parse_version("1.6.0"):
+                WarningToExpect = DeprecationWarning
+            with pytest.warns(WarningToExpect):
+                D_scipy_cdist = cdist(X, Y, metric, **kwargs)
+        else:
+            D_scipy_cdist = cdist(X, Y, metric, **kwargs)
+
+        dm = DistanceMetricInterface.get_metric(metric, **kwargs)
+        D_sklearn = dm.pairwise(X, Y)
+        assert_array_almost_equal(D_sklearn, D_scipy_cdist)
 
 
 @pytest.mark.parametrize("metric", BOOL_METRICS)
@@ -107,10 +110,6 @@ def test_cdist(metric_param_grid, X, Y):
 )
 def test_cdist_bool_metric(metric, X_bool, Y_bool):
     D_true = cdist(X_bool, Y_bool, metric)
-    check_cdist_bool(metric, D_true)
-
-
-def check_cdist_bool(metric, D_true):
     dm = DistanceMetric.get_metric(metric)
     D12 = dm.pairwise(X_bool, Y_bool)
     assert_array_almost_equal(D12, D_true)
@@ -121,6 +120,9 @@ def check_cdist_bool(metric, D_true):
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
 @pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
 def test_pdist(metric_param_grid, X, Y):
+    DistanceMetricInterface = (
+        DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32
+    )
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -141,23 +143,15 @@ def test_pdist(metric_param_grid, X, Y):
         else:
             D_true = cdist(X, X, metric, **kwargs)
 
-        check_pdist(metric, kwargs, D_true)
+        dm = DistanceMetricInterface.get_metric(metric, **kwargs)
+        D12 = dm.pairwise(X)
+        assert_array_almost_equal(D12, D_true)
 
 
 @pytest.mark.parametrize("metric", BOOL_METRICS)
 @pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
 def test_pdist_bool_metrics(metric, X_bool):
     D_true = cdist(X_bool, X_bool, metric)
-    check_pdist_bool(metric, D_true)
-
-
-def check_pdist(metric, kwargs, D_true):
-    dm = DistanceMetric.get_metric(metric, **kwargs)
-    D12 = dm.pairwise(X64)
-    assert_array_almost_equal(D12, D_true)
-
-
-def check_pdist_bool(metric, D_true):
     dm = DistanceMetric.get_metric(metric)
     D12 = dm.pairwise(X_bool)
     # Based on https://github.com/scipy/scipy/pull/7373
@@ -172,7 +166,11 @@ def check_pdist_bool(metric, D_true):
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("writable_kwargs", [True, False])
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
-def test_pickle(writable_kwargs, metric_param_grid):
+@pytest.mark.parametrize("X", [X64, X32])
+def test_pickle(writable_kwargs, metric_param_grid, X):
+    DistanceMetricInterface = (
+        DistanceMetric if X.dtype == np.float64 else DistanceMetric32
+    )
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -182,7 +180,11 @@ def test_pickle(writable_kwargs, metric_param_grid):
                 if isinstance(val, np.ndarray):
                     val.setflags(write=writable_kwargs)
         kwargs = dict(zip(keys, vals))
-        check_pickle(metric, kwargs)
+        dm = DistanceMetricInterface.get_metric(metric, **kwargs)
+        D1 = dm.pairwise(X)
+        dm2 = pickle.loads(pickle.dumps(dm))
+        D2 = dm2.pairwise(X)
+        assert_array_almost_equal(D1, D2)
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@@ -197,14 +199,6 @@ def test_pickle_bool_metrics(metric, X_bool):
     assert_array_almost_equal(D1, D2)
 
 
-def check_pickle(metric, kwargs):
-    dm = DistanceMetric.get_metric(metric, **kwargs)
-    D1 = dm.pairwise(X64)
-    dm2 = pickle.loads(pickle.dumps(dm))
-    D2 = dm2.pairwise(X64)
-    assert_array_almost_equal(D1, D2)
-
-
 def test_haversine_metric():
     def haversine_slow(x1, x2):
         return 2 * np.arcsin(

From 42eba725dda7f81a0c790a5ee073985bc0ed2ea2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Feb 2022 15:22:18 +0100
Subject: [PATCH 09/26] MAINT Upcast buffers to 64bit when and where needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tempita is a funny preprocessor. 🤙
---
 sklearn/metrics/_dist_metrics.pxd.tp          |  16 +-
 sklearn/metrics/_dist_metrics.pyx.tp          |  17 +-
 .../_pairwise_distances_reduction.pyx.tp      | 186 ++++++++++++++----
 3 files changed, 179 insertions(+), 40 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index ca81e978ba154..752d794e5c168 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -5,8 +5,20 @@ implementation_specific_values = [
     #
     #       name_suffix, bitness, DTYPE_t, DTYPE
     #
-    # We use an empty string as to still be able to expose the same
-    # API for the reference 64bit implementations.
+    # On the first hand, an empty string is use for `name_suffix`
+    # for the 64bit case as to still be able to expose the original
+    # 64bit implementation under the same API, namely `DistanceMetric`.
+    #
+    # On the other hand, '32' bit is use for `name_suffix`
+    # for the 32bit case to remove ambiguity and use `DistanceMetric32`,
+    # which is not publicly exposed.
+    #
+    # The metric mapping is adapted accordingly to route to the correct
+    # implementations.
+    #
+    # We also use 64bit types as defined in `sklearn.utils._typedefs`
+    # to maintain backward compatibility as the symbol level for extra
+    # safety.
     #
     ('', '64', 'DTYPE_t', 'DTYPE'),
     ('32', '32', 'np.float32_t', 'np.float32')
diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index b2fc047d570d0..48f11a80726f4 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -5,8 +5,21 @@ implementation_specific_values = [
     #
     #       name_suffix, bitness, DTYPE_t, DTYPE
     #
-    # We use an empty string as to still be able to expose the same
-    # API for the reference 64bit implementations.
+    #
+    # On the first hand, an empty string is use for `name_suffix`
+    # for the 64bit case as to still be able to expose the original
+    # 64bit implementation under the same API, namely `DistanceMetric`.
+    #
+    # On the other hand, '32' bit is use for `name_suffix`
+    # for the 32bit case to remove ambiguity and use `DistanceMetric32`,
+    # which is not publicly exposed.
+    #
+    # The metric mapping is adapted accordingly to route to the correct
+    # implementations.
+    #
+    # We also use 64bit types as defined in `sklearn.utils._typedefs`
+    # to maintain backward compatibility as the symbol level for extra
+    # safety.
     #
     ('', '64', 'DTYPE_t', 'DTYPE'),
     ('32', '32', 'np.float32_t', 'np.float32')
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index 2803e65416ab8..fa4499cecbe7b 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -3,13 +3,25 @@
 implementation_specific_values = [
     # Values are the following ones:
     #
-    #       name_suffix, bitness, DTYPE_t, DTYPE
+    #       name_suffix, bitness, DTYPE_t, DTYPE, need_upcast
     #
-    # We use an empty string as to still be able to expose the same
-    # API for the reference 64bit implementations.
+    # On the first hand, an empty string is use for `name_suffix`
+    # for the 64bit case as to still be able to expose the original
+    # 64bit implementation under the same API, namely `DistanceMetric`.
     #
-    ('', '64', 'DTYPE_t', 'DTYPE'),
-    ('32', '32', 'np.float32_t', 'np.float32')
+    # On the other hand, '32' bit is use for `name_suffix`
+    # for the 32bit case to remove ambiguity and use `DistanceMetric32`,
+    # which is not publicly exposed.
+    #
+    # The metric mapping is adapted accordingly to route to the correct
+    # implementations.
+    #
+    # We also use 64bit types as defined in `sklearn.utils._typedefs`
+    # to maintain backward compatibility as the symbol level for extra
+    # safety.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE', False),
+    ('32', '32', 'np.float32_t', 'np.float32', True)
 ]
 
 }}
@@ -36,6 +48,7 @@ from .. import get_config
 from libc.stdlib cimport free, malloc
 from libc.float cimport DBL_MAX
 from cython cimport final
+from cython.operator cimport dereference as deref
 from cython.parallel cimport parallel, prange
 
 from ..utils._cython_blas cimport (
@@ -319,12 +332,33 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         )
 
 
-{{for distance_suffix, bitness, DTYPE_t, DTYPE in implementation_specific_values}}
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
+    const DTYPE_t[:, ::1] X,
+    ITYPE_t num_threads,
+):
+    """Compute the squared euclidean norm of the rows of X in parallel.
 
-from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
+    This is faster than using np.einsum("ij, ij->i") even when using a single thread.
+    """
+    cdef:
+        # Casting for X to remove the const qualifier is needed because APIs
+        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
+        # const qualifier.
+        # See: https://github.com/scipy/scipy/issues/14262
+        DTYPE_t * X_ptr = <DTYPE_t *> &X[0, 0]
+        ITYPE_t i = 0
+        ITYPE_t n = X.shape[0]
+        ITYPE_t d = X.shape[1]
+        DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
+
+    for i in prange(n, schedule='static', nogil=True, num_threads=num_threads):
+        squared_row_norms[i] = _dot(d, X_ptr + i * d, 1, X_ptr + i * d, 1)
 
-cpdef {{DTYPE_t}}[::1] _sqeuclidean_row_norms{{bitness}}(
-    const {{DTYPE_t}}[:, ::1] X,
+    return squared_row_norms
+
+
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
+    const np.float32_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
     """Compute the squared euclidean norm of the rows of X in parallel.
@@ -336,18 +370,36 @@ cpdef {{DTYPE_t}}[::1] _sqeuclidean_row_norms{{bitness}}(
         # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
         # const qualifier.
         # See: https://github.com/scipy/scipy/issues/14262
-        {{DTYPE_t}} * X_ptr = <{{DTYPE_t}} *> &X[0, 0]
-        ITYPE_t idx = 0
+        np.float32_t * X_ptr = <np.float32_t *> &X[0, 0]
+        ITYPE_t i = 0, j = 0
         ITYPE_t n = X.shape[0]
         ITYPE_t d = X.shape[1]
-        {{DTYPE_t}}[::1] squared_row_norms = np.empty(n, dtype={{DTYPE}})
+        DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
+
+        # To upcast the i-th row of X from 32bit to 64bit
+        DTYPE_t * X_idx_upcast_ptr
+
+    with nogil, parallel(num_threads=num_threads):
+        # Thread-local buffer allocation
+        X_i_upcast_ptr = <DTYPE_t* > malloc(sizeof(DTYPE_t) * d)
+        for i in prange(n, schedule='static'):
+
+            # Upcasting the i-th row of X from 32bit to 64bit
+            for j in range(d):
+                X_i_upcast_ptr[j] = <DTYPE_t> deref(X_ptr + i * d + j)
 
-    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
-        squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)
+            squared_row_norms[i] = _dot(d, X_i_upcast_ptr, 1, X_i_upcast_ptr, 1)
+
+        free(X_i_upcast_ptr)
 
     return squared_row_norms
 
 
+{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
+
+from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
+
+
 cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
     """{{bitness}}bit implementation of PairwiseDistancesReduction."""
 
@@ -770,12 +822,6 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
             metric in ("euclidean", "sqeuclidean")
             and not issparse(X)
             and not issparse(Y)
-            # As of now, we do not route the 32bit case to the specialized
-            # implementation as defining the upcast fixtures for GEMM using
-            # Tempita is non-trivial
-            # TODO: find a way to define the upcasting fixtures with Tempita
-            # for the FastEuclideanPairwiseDistancesArgKmin32.
-            and {{bitness}} == 64
         ):
             # Specialized implementation with improved arithmetic intensity
             # and vector instructions (SIMD) by processing several vectors
@@ -1038,13 +1084,20 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
     cdef:
         const {{DTYPE_t}}[:, ::1] X
         const {{DTYPE_t}}[:, ::1] Y
-        const {{DTYPE_t}}[::1] X_norm_squared
-        const {{DTYPE_t}}[::1] Y_norm_squared
+        const DTYPE_t[::1] X_norm_squared
+        const DTYPE_t[::1] Y_norm_squared
 
         # Buffers for GEMM
-        {{DTYPE_t}} ** dist_middle_terms_chunks
+        DTYPE_t ** dist_middle_terms_chunks
         bint use_squared_distances
 
+{{if need_upcast}}
+        # Buffers for upcasting chunks of X and Y
+        # from 32bit to 64bit.
+        DTYPE_t ** X_c_upcast
+        DTYPE_t ** Y_c_upcast
+{{endif}}
+
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
         return (PairwiseDistancesArgKmin{{bitness}}.is_usable_for(X, Y, metric) and
@@ -1095,13 +1148,30 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         self.use_squared_distances = use_squared_distances
 
         # Temporary datastructures used in threads
-        self.dist_middle_terms_chunks = <{{DTYPE_t}} **> malloc(
-            sizeof({{DTYPE_t}} *) * self.chunks_n_threads
+        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(
+            sizeof(DTYPE_t *) * self.chunks_n_threads
+        )
+
+{{if need_upcast}}
+        # Buffers for upcasting chunks of X and Y
+        # from 32bit to 64bit.
+        self.X_c_upcast = <DTYPE_t **> malloc(
+            sizeof(DTYPE_t *) * self.chunks_n_threads
         )
+        self.Y_c_upcast = <DTYPE_t **> malloc(
+            sizeof(DTYPE_t *) * self.chunks_n_threads
+        )
+{{endif}}
 
     def __dealloc__(self):
         if self.dist_middle_terms_chunks is not NULL:
             free(self.dist_middle_terms_chunks)
+{{if need_upcast}}
+        if self.X_c_upcast is not NULL:
+            free(self.X_c_upcast)
+        if self.Y_c_upcast is not NULL:
+            free(self.Y_c_upcast)
+{{endif}}
 
     @final
     cdef void compute_exact_distances(self) nogil:
@@ -1116,9 +1186,19 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_init(self, thread_num)
 
         # Temporary buffer for the `-2 * X_c @ Y_c.T` term
-        self.dist_middle_terms_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
-            self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof({{DTYPE_t}})
+        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
+            self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
+        )
+{{if need_upcast}}
+        # Buffers for upcasting chunks of X and Y
+        # from 32bit to 64bit.
+        self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
+            self.X_n_samples_chunk * self.X.shape[1] * sizeof(DTYPE_t)
+        )
+        self.Y_c_upcast[thread_num] = <DTYPE_t *> malloc(
+            self.Y_n_samples_chunk * self.Y.shape[1] * sizeof(DTYPE_t)
         )
+{{endif}}
 
     @final
     cdef void _parallel_on_X_parallel_finalize(
@@ -1127,6 +1207,10 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
     ) nogil:
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_finalize(self, thread_num)
         free(self.dist_middle_terms_chunks[thread_num])
+{{if need_upcast}}
+        free(self.X_c_upcast[thread_num])
+        free(self.Y_c_upcast[thread_num])
+{{endif}}
 
     @final
     cdef void _parallel_on_Y_init(
@@ -1137,9 +1221,19 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
 
         for thread_num in range(self.chunks_n_threads):
             # Temporary buffer for the `-2 * X_c @ Y_c.T` term
-            self.dist_middle_terms_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
-                self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof({{DTYPE_t}})
+            self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
+                self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
             )
+{{if need_upcast}}
+            # Buffers for upcasting chunks of X and Y
+            # from 32bit to 64bit.
+            self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
+                self.X_n_samples_chunk * self.X.shape[1] * sizeof(DTYPE_t)
+            )
+            self.Y_c_upcast[thread_num] = <DTYPE_t *> malloc(
+                self.Y_n_samples_chunk * self.Y.shape[1] * sizeof(DTYPE_t)
+            )
+{{endif}}
 
     @final
     cdef void _parallel_on_Y_finalize(
@@ -1150,6 +1244,10 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
 
         for thread_num in range(self.chunks_n_threads):
             free(self.dist_middle_terms_chunks[thread_num])
+{{if need_upcast}}
+            free(self.X_c_upcast[thread_num])
+            free(self.Y_c_upcast[thread_num])
+{{endif}}
 
     @final
     cdef void _compute_and_reduce_distances_on_chunks(
@@ -1165,7 +1263,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
 
             const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
             const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            {{DTYPE_t}} *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             {{DTYPE_t}} *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
@@ -1181,15 +1279,31 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             ITYPE_t n = Y_c.shape[0]
             ITYPE_t K = X_c.shape[1]
             {{DTYPE_t}} alpha = - 2.
-            # Casting for A and B to remove the const is needed because APIs exposed via
-            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
-            # See: https://github.com/scipy/scipy/issues/14262
-            {{DTYPE_t}} * A = <{{DTYPE_t}}*> & X_c[0, 0]
             ITYPE_t lda = X_c.shape[1]
-            {{DTYPE_t}} * B = <{{DTYPE_t}}*> & Y_c[0, 0]
             ITYPE_t ldb = X_c.shape[1]
             {{DTYPE_t}} beta = 0.
             ITYPE_t ldc = Y_c.shape[0]
+{{if need_upcast}}
+            DTYPE_t * A = self.X_c_upcast[thread_num]
+            DTYPE_t * B = self.Y_c_upcast[thread_num]
+
+        # Upcasting X_c from float32 to float64
+        for i in range(m):
+            for j in range(lda):
+                A[i * lda + j] = <DTYPE_t> X_c[i, j]
+
+        # Upcasting Y_c from float32 to float64
+        for i in range(n):
+            for j in range(lda):
+                B[i * lda + j] = <DTYPE_t> Y_c[i, j]
+
+{{else}}
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            # See: https://github.com/scipy/scipy/issues/14262
+            DTYPE_t * A = <{{DTYPE_t}} *> &X_c[0, 0]
+            DTYPE_t * B = <{{DTYPE_t}} *> &Y_c[0, 0]
+{{endif}}
 
         # dist_middle_terms = `-2 * X_c @ Y_c.T`
         _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
@@ -1206,7 +1320,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
                     #
                     #             ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
                     #
-                    (
+                    <{{DTYPE_t}}> (
                         self.X_norm_squared[i + X_start] +
                         dist_middle_terms[i * Y_c.shape[0] + j] +
                         self.Y_norm_squared[j + Y_start]

From b41c8aa6a1e8ddb3989fce5ee07747008c0750a0 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Sat, 26 Feb 2022 12:38:00 +0100
Subject: [PATCH 10/26] MAINT Improve upcast

Previously, the upcast was done in the critical region.
This caused an unneeded upcast for one of the buffers.

This change upcasts buffers only when and where necessary,
without the duplicated work of the previous approach.

Two methods are introduced to perform this upcast
for each parallelization strategy.

However, this adds some complexity to the templating.
---
 .../_pairwise_distances_reduction.pyx.tp      | 168 +++++++++++++++---
 1 file changed, 146 insertions(+), 22 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index fa4499cecbe7b..b15191d851661 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -527,7 +527,7 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                     X_end = X_start + self.X_n_samples_chunk
 
                 # Reinitializing thread datastructures for the new X chunk
-                self._parallel_on_X_init_chunk(thread_num, X_start)
+                self._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
                 for Y_chunk_idx in range(self.Y_n_chunks):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -536,6 +536,12 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+                    self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
+                    )
+
                     self._compute_and_reduce_distances_on_chunks(
                         X_start, X_end,
                         Y_start, Y_end,
@@ -588,7 +594,7 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                 thread_num = _openmp_thread_num()
 
                 # Initializing datastructures used in this thread
-                self._parallel_on_Y_parallel_init(thread_num)
+                self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -597,6 +603,12 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+                    self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
+                    )
+
                     self._compute_and_reduce_distances_on_chunks(
                         X_start, X_end,
                         Y_start, Y_end,
@@ -630,7 +642,8 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
         """Compute the pairwise distances on two chunks of X and Y and reduce them.
 
         This is THE core computational method of PairwiseDistanceReductions{{bitness}}.
-        This must be implemented in subclasses.
+        This must be implemented in subclasses agnostically from the parallelisation
+        strategies.
         """
         return
 
@@ -658,10 +671,22 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
         self,
         ITYPE_t thread_num,
         ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         """Initialise datastructures used in a thread given its number."""
         return
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks."""
+        return
+
     cdef void _parallel_on_X_prange_iter_finalize(
         self,
         ITYPE_t thread_num,
@@ -687,10 +712,23 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         """Initialise datastructures used in a thread given its number."""
         return
 
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks."""
+        return
+
     cdef void _parallel_on_Y_synchronize(
         self,
         ITYPE_t X_start,
@@ -923,11 +961,15 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
                     Y_start + j,
                 )
 
+{{if need_upcast}}
+{{else}}
     @final
+{{endif}}
     cdef void _parallel_on_X_init_chunk(
         self,
         ITYPE_t thread_num,
         ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         # As this strategy is embarrassingly parallel, we can set each
         # thread's heaps pointer to the proper position on the main heaps.
@@ -978,10 +1020,15 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
                 heaps_size * sizeof(ITYPE_t)
             )
 
+{{if need_upcast}}
+{{else}}
     @final
+{{endif}}
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         # Initialising heaps (memset can't be used here)
         for idx in range(self.X_n_samples_chunk * self.k):
@@ -1092,8 +1139,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         bint use_squared_distances
 
 {{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y
-        # from 32bit to 64bit.
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
         DTYPE_t ** X_c_upcast
         DTYPE_t ** Y_c_upcast
 {{endif}}
@@ -1153,8 +1199,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         )
 
 {{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y
-        # from 32bit to 64bit.
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
         self.X_c_upcast = <DTYPE_t **> malloc(
             sizeof(DTYPE_t *) * self.chunks_n_threads
         )
@@ -1190,8 +1235,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
         )
 {{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y
-        # from 32bit to 64bit.
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
         self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
             self.X_n_samples_chunk * self.X.shape[1] * sizeof(DTYPE_t)
         )
@@ -1200,6 +1244,51 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         )
 {{endif}}
 
+{{if need_upcast}}
+    @final
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t n_features = self.X.shape[1]
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(n_features):
+                self.X_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+
+    @final
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t n_features = self.Y.shape[1]
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(n_features):
+                self.Y_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{endif}}
+
     @final
     cdef void _parallel_on_X_parallel_finalize(
         self,
@@ -1225,8 +1314,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
                 self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
             )
 {{if need_upcast}}
-            # Buffers for upcasting chunks of X and Y
-            # from 32bit to 64bit.
+            # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
             self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
                 self.X_n_samples_chunk * self.X.shape[1] * sizeof(DTYPE_t)
             )
@@ -1235,6 +1323,52 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             )
 {{endif}}
 
+{{if need_upcast}}
+    @final
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t n_features = self.X.shape[1]
+            ITYPE_t n_chunk_samples = X_end - X_start
+            ITYPE_t i, j
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(n_features):
+                self.X_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+
+    @final
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t n_features = self.Y.shape[1]
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(n_features):
+                self.Y_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{endif}}
+
+
     @final
     cdef void _parallel_on_Y_finalize(
         self,
@@ -1284,19 +1418,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             {{DTYPE_t}} beta = 0.
             ITYPE_t ldc = Y_c.shape[0]
 {{if need_upcast}}
+            # Those two buffers have been upcast from 32bit to 64bit previously.
             DTYPE_t * A = self.X_c_upcast[thread_num]
             DTYPE_t * B = self.Y_c_upcast[thread_num]
-
-        # Upcasting X_c from float32 to float64
-        for i in range(m):
-            for j in range(lda):
-                A[i * lda + j] = <DTYPE_t> X_c[i, j]
-
-        # Upcasting Y_c from float32 to float64
-        for i in range(n):
-            for j in range(lda):
-                B[i * lda + j] = <DTYPE_t> Y_c[i, j]
-
 {{else}}
             # Casting for A and B to remove the const is needed because APIs exposed via
             # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.

From 65ebc927fb4954c20f68b5bdf6b9a200e89388c2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Sat, 26 Feb 2022 12:50:37 +0100
Subject: [PATCH 11/26] CLN Improve imports and fix duplicated ignored files

---
 .gitignore                                 | 1 -
 sklearn/metrics/tests/test_dist_metrics.py | 9 ++++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index c3e28a9cfc170..967ce97dc38ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,7 +85,6 @@ sklearn/utils/_seq_dataset.pxd
 sklearn/utils/_weight_vector.pyx
 sklearn/utils/_weight_vector.pxd
 sklearn/linear_model/_sag_fast.pyx
-sklearn/metrics/_weight_vector.pyx
 sklearn/metrics/_dist_metrics.pyx
 sklearn/metrics/_dist_metrics.pxd
 sklearn/metrics/_pairwise_distances_reduction.pyx
diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
index f6bab12e0efcd..1a24afe2b2b30 100644
--- a/sklearn/metrics/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -11,9 +11,12 @@
 from scipy.spatial.distance import cdist
 from sklearn.metrics import DistanceMetric
 
-# Private DistanceMetric for 32 bit
-from sklearn.metrics._dist_metrics import DistanceMetric32
-from sklearn.metrics._dist_metrics import BOOL_METRICS
+from sklearn.metrics._dist_metrics import (
+    BOOL_METRICS,
+    # Unexposed private DistanceMetric for 32 bit
+    DistanceMetric32,
+)
+
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import create_memmap_backed_data
 from sklearn.utils.fixes import sp_version, parse_version

From ffb08acec26b0f46e6056d3efda6cd624e37e177 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Mon, 28 Feb 2022 09:24:13 +0100
Subject: [PATCH 12/26] MAINT Do not warn if Y_norm_squared is passed via
 metric_kwargs

---
 sklearn/metrics/_pairwise_distances_reduction.pyx.tp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index b15191d851661..794a86c13d0dc 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -1159,7 +1159,11 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         strategy=None,
         metric_kwargs=None,
     ):
-        if metric_kwargs is not None and len(metric_kwargs) > 0:
+        if (
+            metric_kwargs is not None and
+            len(metric_kwargs) > 0 and
+            "Y_norm_squared" not in metric_kwargs
+        ):
             warnings.warn(
                 f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't"
                 f"usable for this case ({self.__class__.__name__}) and will be ignored.",

From 059128fb916f86bf54b5d9fe382cd82d23241c1d Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Mon, 28 Feb 2022 10:22:25 +0100
Subject: [PATCH 13/26] DOC Update whats_new entry

---
 doc/whats_new/v1.1.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index b0d36364ec333..8bcf0450d32e1 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -73,9 +73,9 @@ Changelog
     where 123456 is the *pull request* number, not the issue number.
 
 - |Efficiency| Low-level routines for reductions on pairwise distances
-  for dense float64 datasets have been refactored. The following functions
-  and estimators now benefit from improved performances, in particular on
-  multi-cores machines:
+  for dense float32 and float64 datasets have been refactored.
+  The following functions and estimators now benefit from improved performances,
+  in particular on multi-cores machines:
   - :func:`sklearn.metrics.pairwise_distances_argmin`
   - :func:`sklearn.metrics.pairwise_distances_argmin_min`
   - :class:`sklearn.cluster.AffinityPropagation`
@@ -98,7 +98,7 @@ Changelog
   For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors`
   can be up to ×20 faster than in the previous versions'.
 
-  :pr:`21987`, :pr:`22064`, :pr:`22065` and :pr:`22288`
+  :pr:`21987`, :pr:`22064`, :pr:`22065`, :pr:`22288` and :pr:`22590`
   by :user:`Julien Jerphanion <jjerphan>`
 
 - |Enhancement| All scikit-learn models now generate a more informative

From 7ccd37ed9a2d7b70a8a7b58f6d311957d8f856e4 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Mon, 28 Feb 2022 11:49:41 +0100
Subject: [PATCH 14/26] TST Add a test for dtype agnosticism

Also remove unneeded parametrisation.
---
 .../test_pairwise_distances_reduction.py      | 48 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 529c2228b5ddf..1e58d2e2609bb 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -266,8 +266,6 @@ def test_chunk_size_agnosticism(
 
 @pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
-@pytest.mark.parametrize("chunk_size", [50, 512, 1024])
-@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin],
@@ -310,6 +308,52 @@ def test_n_threads_agnosticism(
     )
 
 
+@pytest.mark.parametrize("seed", range(5))
+@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics())
+@pytest.mark.parametrize(
+    "PairwiseDistancesReduction",
+    [PairwiseDistancesArgKmin],
+)
+def test_dtype_agnosticism(
+    PairwiseDistancesReduction,
+    seed,
+    metric,
+    n_samples=1000,
+    n_features=100,
+):
+    rng = np.random.RandomState(seed)
+    spread = 100
+    X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
+    Y64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
+    X32 = X64.astype(np.float32)
+    Y32 = Y64.astype(np.float32)
+
+    parameter = (
+        10
+        if PairwiseDistancesReduction is PairwiseDistancesArgKmin
+        # Scaling the radius slightly with the number of dimensions
+        else 10 ** np.log(n_features)
+    )
+
+    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
+        X64,
+        Y64,
+        parameter,
+        return_distance=True,
+    )
+
+    dist, indices = PairwiseDistancesReduction.compute(
+        X32, Y32, parameter, return_distance=True
+    )
+
+    # We check results against np.float32 because we inherently
+    # lose the information from np.float64.
+    dist = dist.astype(ref_dist.dtype)
+    ASSERT_RESULT[(PairwiseDistancesArgKmin, np.float32)](
+        ref_dist, dist, ref_indices, indices
+    )
+
+
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("seed", range(5))

From 7645ba356b2c6232ba2a9052430afe67aef5a0a2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 2 Mar 2022 15:32:09 +0100
Subject: [PATCH 15/26] MAINT Accumulate using float64

---
 sklearn/metrics/_dist_metrics.pxd.tp          |  26 +--
 sklearn/metrics/_dist_metrics.pyx.tp          | 216 ++++++------------
 .../_pairwise_distances_reduction.pyx.tp      |  26 +--
 3 files changed, 101 insertions(+), 167 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index 752d794e5c168..ff3e01c66e564 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -37,19 +37,19 @@ from ..utils._typedefs cimport DTYPE_t, ITYPE_t
 #
 #  We use these for the default (euclidean) case so that they can be
 #  inlined.  This leads to faster computation for the most common case
-cdef inline {{DTYPE_t}} euclidean_dist{{name_suffix}}(const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+cdef inline DTYPE_t euclidean_dist{{name_suffix}}(const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                                    ITYPE_t size) nogil except -1:
-    cdef {{DTYPE_t}} tmp, d=0
+    cdef DTYPE_t tmp, d=0
     cdef np.intp_t j
     for j in range(size):
-        tmp = x1[j] - x2[j]
+        tmp = <DTYPE_t> (x1[j] - x2[j])
         d += tmp * tmp
     return sqrt(d)
 
 
-cdef inline {{DTYPE_t}} euclidean_rdist{{name_suffix}}(const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+cdef inline DTYPE_t euclidean_rdist{{name_suffix}}(const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                                     ITYPE_t size) nogil except -1:
-    cdef {{DTYPE_t}} tmp, d=0
+    cdef DTYPE_t tmp, d=0
     cdef np.intp_t j
     for j in range(size):
         tmp = x1[j] - x2[j]
@@ -57,11 +57,11 @@ cdef inline {{DTYPE_t}} euclidean_rdist{{name_suffix}}(const {{DTYPE_t}}* x1, co
     return d
 
 
-cdef inline {{DTYPE_t}} euclidean_dist_to_rdist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
+cdef inline DTYPE_t euclidean_dist_to_rdist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
     return dist * dist
 
 
-cdef inline {{DTYPE_t}} euclidean_rdist_to_dist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
+cdef inline DTYPE_t euclidean_rdist_to_dist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
     return sqrt(dist)
 
 
@@ -79,10 +79,10 @@ cdef class DistanceMetric{{name_suffix}}:
     cdef object func
     cdef object kwargs
 
-    cdef {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                       ITYPE_t size) nogil except -1
 
-    cdef {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                        ITYPE_t size) nogil except -1
 
     cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1
@@ -90,9 +90,9 @@ cdef class DistanceMetric{{name_suffix}}:
     cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y,
                    {{DTYPE_t}}[:, ::1] D) except -1
 
-    cdef {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1
+    cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1
 
-    cdef {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1
+    cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1
 
 
 ######################################################################
@@ -104,9 +104,9 @@ cdef class DatasetsPair{{name_suffix}}:
 
     cdef ITYPE_t n_samples_Y(self) nogil
 
-    cdef {{DTYPE_t}} dist(self, ITYPE_t i, ITYPE_t j) nogil
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil
 
-    cdef {{DTYPE_t}} surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
 
 
 cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index 48f11a80726f4..672e3fce086c4 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -333,7 +333,7 @@ cdef class DistanceMetric{{name_suffix}}:
         """
         return
 
-    cdef {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                       ITYPE_t size) nogil except -1:
         """Compute the distance between vectors x1 and x2
 
@@ -341,7 +341,7 @@ cdef class DistanceMetric{{name_suffix}}:
         """
         return -999
 
-    cdef {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                        ITYPE_t size) nogil except -1:
         """Compute the rank-preserving surrogate distance between vectors x1 and x2.
 
@@ -374,11 +374,11 @@ cdef class DistanceMetric{{name_suffix}}:
                 D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
         return 0
 
-    cdef {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+    cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         """Convert the rank-preserving surrogate distance to the distance"""
         return rdist
 
-    cdef {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+    cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         """Convert the distance to the rank-preserving surrogate distance"""
         return dist
 
@@ -474,18 +474,18 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     def __init__(self):
         self.p = 2
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return euclidean_dist{{name_suffix}}(x1, x2, size)
 
-    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
         return euclidean_rdist{{name_suffix}}(x1, x2, size)
 
-    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return sqrt(rdist)
 
-    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return dist * dist
 
     def rdist_to_dist(self, rdist):
@@ -513,23 +513,23 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
         if X.shape[1] != self.size:
             raise ValueError('SEuclidean dist: size of V does not match')
 
-    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} tmp, d=0
+        cdef DTYPE_t tmp, d=0
         cdef np.intp_t j
         for j in range(size):
-            tmp = x1[j] - x2[j]
-            d += tmp * tmp / self.vec[j]
+            tmp = <DTYPE_t> (x1[j] - x2[j])
+            d += <DTYPE_t> (tmp * tmp / self.vec[j])
         return d
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return sqrt(self.rdist(x1, x2, size))
 
-    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return sqrt(rdist)
 
-    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return dist * dist
 
     def rdist_to_dist(self, rdist):
@@ -551,12 +551,12 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     def __init__(self):
         self.p = 1
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} d = 0
+        cdef DTYPE_t d = 0
         cdef np.intp_t j
         for j in range(size):
-            d += fabs(x1[j] - x2[j])
+            d += <DTYPE_t> fabs(x1[j] - x2[j])
         return d
 
 
@@ -584,12 +584,12 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     def __init__(self):
         self.p = INF{{name_suffix}}
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} d = 0
+        cdef DTYPE_t d = 0
         cdef np.intp_t j
         for j in range(size):
-            d = fmax(d, fabs(x1[j] - x2[j]))
+            d = <DTYPE_t> fmax(d, fabs(x1[j] - x2[j]))
         return d
 
 
@@ -646,28 +646,28 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
                              f"the number of features ({X.shape[1]}). "
                              f"Currently len(w)={self.size}.")
 
-    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} d=0
+        cdef DTYPE_t d=0
         cdef np.intp_t j
         cdef bint has_w = self.size > 0
         if has_w:
             for j in range(size):
-                d += self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)
+                d += <DTYPE_t> (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p))
         else:
             for j in range(size):
-                d += pow(fabs(x1[j] - x2[j]), self.p)
+                d += <DTYPE_t> (pow(fabs(x1[j] - x2[j]), self.p))
         return d
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        return pow(self.rdist(x1, x2, size), 1. / self.p)
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
 
-    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
-        return pow(rdist, 1. / self.p)
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return <DTYPE_t> pow(rdist, 1. / self.p)
 
-    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
-        return pow(dist, self.p)
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return <DTYPE_t> pow(dist, self.p)
 
     def rdist_to_dist(self, rdist):
         return rdist ** (1. / self.p)
@@ -720,23 +720,23 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             raise ValueError('WMinkowskiDistance dist: '
                              'size of w does not match')
 
-    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} d=0
+        cdef DTYPE_t d = 0
         cdef np.intp_t j
         for j in range(size):
-            d += pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)
+            d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
         return d
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        return pow(self.rdist(x1, x2, size), 1. / self.p)
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
 
-    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
-        return pow(rdist, 1. / self.p)
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return <DTYPE_t> pow(rdist, 1. / self.p)
 
-    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
-        return pow(dist, self.p)
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return <DTYPE_t> pow(dist, self.p)
 
     def rdist_to_dist(self, rdist):
         return rdist ** (1. / self.p)
@@ -783,9 +783,9 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
         if X.shape[1] != self.size:
             raise ValueError('Mahalanobis dist: size of V does not match')
 
-    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} tmp, d = 0
+        cdef DTYPE_t tmp, d = 0
         cdef np.intp_t i, j
 
         # compute (x1 - x2).T * VI * (x1 - x2)
@@ -799,14 +799,14 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             d += tmp * self.vec[i]
         return d
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return sqrt(self.rdist(x1, x2, size))
 
-    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return sqrt(rdist)
 
-    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
         return dist * dist
 
     def rdist_to_dist(self, rdist):
@@ -828,7 +828,7 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     .. math::
        D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int n_unequal = 0
         cdef np.intp_t j
@@ -850,14 +850,14 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     .. math::
        D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} denom, d = 0
+        cdef DTYPE_t denom, d = 0
         cdef np.intp_t j
         for j in range(size):
-            denom = fabs(x1[j]) + fabs(x2[j])
+            denom = <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
             if denom > 0:
-                d += fabs(x1[j] - x2[j]) / denom
+                d += <DTYPE_t>(fabs(x1[j] - x2[j])) / denom
         return d
 
 
@@ -873,13 +873,13 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     .. math::
        D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} num = 0, denom = 0
+        cdef DTYPE_t num = 0, denom = 0
         cdef np.intp_t j
         for j in range(size):
-            num += fabs(x1[j] - x2[j])
-            denom += fabs(x1[j]) + fabs(x2[j])
+            num += <DTYPE_t> fabs(x1[j] - x2[j])
+            denom += <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
         if denom > 0:
             return num / denom
         else:
@@ -897,7 +897,7 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_eq = 0, nnz = 0
         cdef np.intp_t j
@@ -925,7 +925,7 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0
         cdef np.intp_t j
@@ -947,7 +947,7 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0, ntt = 0
         cdef np.intp_t j
@@ -970,7 +970,7 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, ntt = 0, n_neq = 0
         cdef np.intp_t j
@@ -993,7 +993,7 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0
         cdef np.intp_t j
@@ -1015,7 +1015,7 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, ntt = 0
         cdef np.intp_t j
@@ -1037,7 +1037,7 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, n_neq = 0
         cdef np.intp_t j
@@ -1059,7 +1059,7 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
     """
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         cdef int tf1, tf2, ntt = 0, n_neq = 0
         cdef np.intp_t j
@@ -1090,21 +1090,21 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             raise ValueError("Haversine distance only valid "
                              "in 2 dimensions")
 
-    cdef inline {{DTYPE_t}} rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) nogil except -1:
-        cdef {{DTYPE_t}} sin_0 = sin(0.5 * (x1[0] - x2[0]))
-        cdef {{DTYPE_t}} sin_1 = sin(0.5 * (x1[1] - x2[1]))
+        cdef DTYPE_t sin_0 = <DTYPE_t> sin(0.5 * (x1[0] - x2[0]))
+        cdef DTYPE_t sin_1 = <DTYPE_t> sin(0.5 * (x1[1] - x2[1]))
         return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
 
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return 2 * asin(sqrt(self.rdist(x1, x2, size)))
 
-    cdef inline {{DTYPE_t}} _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
         return 2 * asin(sqrt(rdist))
 
-    cdef inline {{DTYPE_t}} _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
-        cdef {{DTYPE_t}} tmp = sin(0.5 * dist)
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        cdef DTYPE_t tmp = <DTYPE_t> sin(0.5 * dist)
         return tmp * tmp
 
     def rdist_to_dist(self, rdist):
@@ -1114,72 +1114,6 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
         tmp = np.sin(0.5 * dist)
         return tmp * tmp
 
-
-#------------------------------------------------------------
-# Yule Distance (boolean)
-#  D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft)
-# [This is not a true metric, so we will leave it out.]
-#
-#cdef class YuleDistance(DistanceMetric):
-#    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
-#                             ITYPE_t size):
-#        cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0
-#        cdef np.intp_t j
-#        for j in range(size):
-#            tf1 = x1[j] != 0
-#            tf2 = x2[j] != 0
-#            ntt += tf1 and tf2
-#            ntf += tf1 and (tf2 == 0)
-#            nft += (tf1 == 0) and tf2
-#        nff = size - ntt - ntf - nft
-#        return (2.0 * ntf * nft) / (ntt * nff + ntf * nft)
-
-
-#------------------------------------------------------------
-# Cosine Distance
-#  D(x, y) = dot(x, y) / (|x| * |y|)
-# [This is not a true metric, so we will leave it out.]
-#
-#cdef class CosineDistance(DistanceMetric):
-#    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
-#                             ITYPE_t size):
-#        cdef {{DTYPE_t}} d = 0, norm1 = 0, norm2 = 0
-#        cdef np.intp_t j
-#        for j in range(size):
-#            d += x1[j] * x2[j]
-#            norm1 += x1[j] * x1[j]
-#            norm2 += x2[j] * x2[j]
-#        return 1.0 - d / sqrt(norm1 * norm2)
-
-
-#------------------------------------------------------------
-# Correlation Distance
-#  D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|)
-# [This is not a true metric, so we will leave it out.]
-#
-#cdef class CorrelationDistance(DistanceMetric{{name_suffix}}):
-#    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
-#                             ITYPE_t size):
-#        cdef {{DTYPE_t}} mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0
-#        cdef {{DTYPE_t}} tmp1, tmp2
-#
-#        cdef np.intp_t i
-#        for i in range(size):
-#            mu1 += x1[i]
-#            mu2 += x2[i]
-#        mu1 /= size
-#        mu2 /= size
-#
-#        for i in range(size):
-#            tmp1 = x1[i] - mu1
-#            tmp2 = x2[i] - mu2
-#            x1nrm += tmp1 * tmp1
-#            x2nrm += tmp2 * tmp2
-#            x1Tx2 += tmp1 * tmp2
-#
-#        return (1. - x1Tx2) / sqrt(x1nrm * x2nrm)
-
-
 #------------------------------------------------------------
 # User-defined distance
 #
@@ -1202,11 +1136,11 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
     # only way to be back compatible is to inherit `dist` from the base class
     # without GIL and called an inline `_dist` which acquire GIL.
-    cdef inline {{DTYPE_t}} dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
         return self._dist(x1, x2, size)
 
-    cdef inline {{DTYPE_t}} _dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+    cdef inline DTYPE_t _dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                               ITYPE_t size) except -1 with gil:
         cdef np.ndarray x1arr
         cdef np.ndarray x2arr
@@ -1329,10 +1263,10 @@ cdef class DatasetsPair{{name_suffix}}:
         # TODO: add "with gil: raise" here when supporting Cython 3.0
         return -999
 
-    cdef {{DTYPE_t}} surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.dist(i, j)
 
-    cdef {{DTYPE_t}} dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
         # This is a abstract method.
         # This _must_ always be overwritten in subclasses.
         # TODO: add "with gil: raise" here when supporting Cython 3.0
@@ -1371,13 +1305,13 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         return self.Y.shape[0]
 
     @final
-    cdef {{DTYPE_t}} surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.distance_metric.rdist(&self.X[i, 0],
                                           &self.Y[j, 0],
                                           self.d)
 
     @final
-    cdef {{DTYPE_t}} dist(self, ITYPE_t i, ITYPE_t j) nogil:
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
         return self.distance_metric.dist(&self.X[i, 0],
                                          &self.Y[j, 0],
                                          self.d)
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index 794a86c13d0dc..196e935f908c5 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -750,10 +750,10 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
         ITYPE_t k
 
         ITYPE_t[:, ::1] argkmin_indices
-        {{DTYPE_t}}[:, ::1] argkmin_distances
+        DTYPE_t[:, ::1] argkmin_distances
 
         # Used as array of pointers to private datastructures used in threads.
-        {{DTYPE_t}} ** heaps_r_distances_chunks
+        DTYPE_t ** heaps_r_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
     @classmethod
@@ -916,8 +916,8 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
         #   - when parallelizing on Y, the pointers of those heaps are referencing
         #   small heaps which are thread-wise-allocated and whose content will be
         #   merged with the main heaps'.
-        self.heaps_r_distances_chunks = <{{DTYPE_t}} **> malloc(
-            sizeof({{DTYPE_t}} *) * self.chunks_n_threads
+        self.heaps_r_distances_chunks = <DTYPE_t **> malloc(
+            sizeof(DTYPE_t *) * self.chunks_n_threads
         )
         self.heaps_indices_chunks = <ITYPE_t **> malloc(
             sizeof(ITYPE_t *) * self.chunks_n_threads
@@ -925,7 +925,7 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
 
         # Main heaps which will be returned as results by `PairwiseDistancesArgKmin{{bitness}}.compute`.
         self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE)
-        self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype={{DTYPE}})
+        self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE)
 
     def __dealloc__(self):
         if self.heaps_indices_chunks is not NULL:
@@ -946,7 +946,7 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
             ITYPE_t i, j
             ITYPE_t n_samples_X = X_end - X_start
             ITYPE_t n_samples_Y = Y_end - Y_start
-            {{DTYPE_t}} *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
+            DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
         # Pushing the distances and their associated indices on a heap
@@ -1013,8 +1013,8 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
             # As chunks of X are shared across threads, so must their
             # heaps. To solve this, each thread has its own heaps
             # which are then synchronised back in the main ones.
-            self.heaps_r_distances_chunks[thread_num] = <{{DTYPE_t}} *> malloc(
-                heaps_size * sizeof({{DTYPE_t}})
+            self.heaps_r_distances_chunks[thread_num] = <DTYPE_t *> malloc(
+                heaps_size * sizeof(DTYPE_t)
             )
             self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(
                 heaps_size * sizeof(ITYPE_t)
@@ -1088,7 +1088,7 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
         cdef:
             ITYPE_t i, j
             ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
-            {{DTYPE_t}}[:, ::1] distances = self.argkmin_distances
+            DTYPE_t[:, ::1] distances = self.argkmin_distances
         for i in prange(self.n_samples_X, schedule='static', nogil=True,
                         num_threads=self.effective_n_threads):
             for j in range(self.k):
@@ -1402,7 +1402,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
             const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
-            {{DTYPE_t}} *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
+            DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
             # Careful: LDA, LDB and LDC are given for F-ordered arrays
@@ -1416,10 +1416,10 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             ITYPE_t m = X_c.shape[0]
             ITYPE_t n = Y_c.shape[0]
             ITYPE_t K = X_c.shape[1]
-            {{DTYPE_t}} alpha = - 2.
+            DTYPE_t alpha = - 2.
             ITYPE_t lda = X_c.shape[1]
             ITYPE_t ldb = X_c.shape[1]
-            {{DTYPE_t}} beta = 0.
+            DTYPE_t beta = 0.
             ITYPE_t ldc = Y_c.shape[0]
 {{if need_upcast}}
             # Those two buffers have been upcast from 32bit to 64bit previously.
@@ -1448,7 +1448,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
                     #
                     #             ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
                     #
-                    <{{DTYPE_t}}> (
+                    (
                         self.X_norm_squared[i + X_start] +
                         dist_middle_terms[i * Y_c.shape[0] + j] +
                         self.Y_norm_squared[j + Y_start]

From 2f56a49edf41017391ac7c6cb3b084d3611732ad Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 11 Mar 2022 15:13:59 +0100
Subject: [PATCH 16/26] MAINT 32bit support for DistanceMetric

This has been extracted from 7645ba.
---
 .gitignore                                 |    2 +
 sklearn/metrics/_dist_metrics.pxd          |   87 --
 sklearn/metrics/_dist_metrics.pxd.tp       |  118 ++
 sklearn/metrics/_dist_metrics.pyx          | 1459 +++++++++++++++++---
 sklearn/metrics/_dist_metrics.pyx.tp       | 1319 ++++++++++++++++++
 sklearn/metrics/setup.py                   |    9 +
 sklearn/metrics/tests/test_dist_metrics.py |  141 +-
 7 files changed, 2821 insertions(+), 314 deletions(-)
 delete mode 100644 sklearn/metrics/_dist_metrics.pxd
 create mode 100644 sklearn/metrics/_dist_metrics.pxd.tp
 create mode 100644 sklearn/metrics/_dist_metrics.pyx.tp

diff --git a/.gitignore b/.gitignore
index d6ae51ec333f2..f4125316d7d41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,5 @@ sklearn/utils/_seq_dataset.pxd
 sklearn/utils/_weight_vector.pyx
 sklearn/utils/_weight_vector.pxd
 sklearn/linear_model/_sag_fast.pyx
+sklearn/metrics/_dist_metrics.pyx
+sklearn/metrics/_dist_metrics.pxd
diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
deleted file mode 100644
index e7c2f2ea2f926..0000000000000
--- a/sklearn/metrics/_dist_metrics.pxd
+++ /dev/null
@@ -1,87 +0,0 @@
-cimport numpy as np
-from libc.math cimport sqrt, exp
-
-from ..utils._typedefs cimport DTYPE_t, ITYPE_t
-
-######################################################################
-# Inline distance functions
-#
-#  We use these for the default (euclidean) case so that they can be
-#  inlined.  This leads to faster computation for the most common case
-cdef inline DTYPE_t euclidean_dist(const DTYPE_t* x1, const DTYPE_t* x2,
-                                   ITYPE_t size) nogil except -1:
-    cdef DTYPE_t tmp, d=0
-    cdef np.intp_t j
-    for j in range(size):
-        tmp = x1[j] - x2[j]
-        d += tmp * tmp
-    return sqrt(d)
-
-
-cdef inline DTYPE_t euclidean_rdist(const DTYPE_t* x1, const DTYPE_t* x2,
-                                    ITYPE_t size) nogil except -1:
-    cdef DTYPE_t tmp, d=0
-    cdef np.intp_t j
-    for j in range(size):
-        tmp = x1[j] - x2[j]
-        d += tmp * tmp
-    return d
-
-
-cdef inline DTYPE_t euclidean_dist_to_rdist(const DTYPE_t dist) nogil except -1:
-    return dist * dist
-
-
-cdef inline DTYPE_t euclidean_rdist_to_dist(const DTYPE_t dist) nogil except -1:
-    return sqrt(dist)
-
-
-######################################################################
-# DistanceMetric base class
-cdef class DistanceMetric:
-    # The following attributes are required for a few of the subclasses.
-    # we must define them here so that cython's limited polymorphism will work.
-    # Because we don't expect to instantiate a lot of these objects, the
-    # extra memory overhead of this setup should not be an issue.
-    cdef DTYPE_t p
-    cdef DTYPE_t[::1] vec
-    cdef DTYPE_t[:, ::1] mat
-    cdef ITYPE_t size
-    cdef object func
-    cdef object kwargs
-
-    cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                      ITYPE_t size) nogil except -1
-
-    cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                       ITYPE_t size) nogil except -1
-
-    cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
-
-    cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
-                   DTYPE_t[:, ::1] D) except -1
-
-    cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1
-
-    cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1
-
-
-######################################################################
-# DatasetsPair base class
-cdef class DatasetsPair:
-    cdef DistanceMetric distance_metric
-
-    cdef ITYPE_t n_samples_X(self) nogil
-
-    cdef ITYPE_t n_samples_Y(self) nogil
-
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil
-
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
-
-
-cdef class DenseDenseDatasetsPair(DatasetsPair):
-    cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
-        ITYPE_t d
diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
new file mode 100644
index 0000000000000..ff3e01c66e564
--- /dev/null
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -0,0 +1,118 @@
+{{py:
+
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, bitness, DTYPE_t, DTYPE
+    #
+    # On the one hand, an empty string is used for `name_suffix`
+    # in the 64bit case so that the original 64bit implementation
+    # is still exposed under the same API, namely `DistanceMetric`.
+    #
+    # On the other hand, '32' is used for `name_suffix`
+    # in the 32bit case to remove ambiguity and use `DistanceMetric32`,
+    # which is not publicly exposed.
+    #
+    # The metric mapping is adapted accordingly to route to the correct
+    # implementations.
+    #
+    # We also use 64bit types as defined in `sklearn.utils._typedefs`
+    # to maintain backward compatibility at the symbol level for extra
+    # safety.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE'),
+    ('32', '32', 'np.float32_t', 'np.float32')
+]
+
+}}
+cimport numpy as np
+from libc.math cimport sqrt, exp
+
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t
+
+{{for name_suffix, bitness, DTYPE_t, DTYPE in implementation_specific_values}}
+
+######################################################################
+# Inline distance functions
+#
+#  We use these for the default (euclidean) case so that they can be
+#  inlined.  This leads to faster computation for the most common case
+cdef inline DTYPE_t euclidean_dist{{name_suffix}}(const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                                   ITYPE_t size) nogil except -1:
+    cdef DTYPE_t tmp, d=0
+    cdef np.intp_t j
+    for j in range(size):
+        tmp = <DTYPE_t> (x1[j] - x2[j])
+        d += tmp * tmp
+    return sqrt(d)
+
+
+cdef inline DTYPE_t euclidean_rdist{{name_suffix}}(const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                                    ITYPE_t size) nogil except -1:
+    cdef DTYPE_t tmp, d=0
+    cdef np.intp_t j
+    for j in range(size):
+        tmp = x1[j] - x2[j]
+        d += tmp * tmp
+    return d
+
+
+cdef inline DTYPE_t euclidean_dist_to_rdist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
+    return dist * dist
+
+
+cdef inline DTYPE_t euclidean_rdist_to_dist{{name_suffix}}(const {{DTYPE_t}} dist) nogil except -1:
+    return sqrt(dist)
+
+
+######################################################################
+# DistanceMetric base class
+cdef class DistanceMetric{{name_suffix}}:
+    # The following attributes are required for a few of the subclasses.
+    # we must define them here so that cython's limited polymorphism will work.
+    # Because we don't expect to instantiate a lot of these objects, the
+    # extra memory overhead of this setup should not be an issue.
+    cdef {{DTYPE_t}} p
+    cdef {{DTYPE_t}}[::1] vec
+    cdef {{DTYPE_t}}[:, ::1] mat
+    cdef ITYPE_t size
+    cdef object func
+    cdef object kwargs
+
+    cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                      ITYPE_t size) nogil except -1
+
+    cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                       ITYPE_t size) nogil except -1
+
+    cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1
+
+    cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y,
+                   {{DTYPE_t}}[:, ::1] D) except -1
+
+    cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1
+
+    cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1
+
+
+######################################################################
+# DatasetsPair base class
+cdef class DatasetsPair{{name_suffix}}:
+    cdef DistanceMetric{{name_suffix}} distance_metric
+
+    cdef ITYPE_t n_samples_X(self) nogil
+
+    cdef ITYPE_t n_samples_Y(self) nogil
+
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil
+
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
+
+
+cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+    cdef:
+        const {{DTYPE_t}}[:, ::1] X
+        const {{DTYPE_t}}[:, ::1] Y
+        ITYPE_t d
+
+{{endfor}}
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index d17be2c8cb73d..a64ea88f3b4a6 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -14,26 +14,16 @@ cdef extern from "arrayobject.h":
     object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims,
                                      int typenum, void* data)
 
-
-cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
-    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
-    # In particular, if x is deallocated before the returned array goes
-    # out of scope, this could cause memory errors.  Since there is not
-    # a possibility of this for our use-case, this should be safe.
-
-    # Note: this Segfaults unless np.import_array() is called above
-    return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)
-
-
-from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
-cdef DTYPE_t INF = np.inf
-
 from scipy.sparse import csr_matrix, issparse
 from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DTYPECODE
 from ..utils._typedefs import DTYPE, ITYPE
 from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
 from ..utils import check_array
 
+cdef inline double fmax(double a, double b) nogil:
+    return max(a, b)
+
+
 ######################################################################
 # newObj function
 #  this is a helper function for pickling
@@ -41,35 +31,6 @@ def newObj(obj):
     return obj.__new__(obj)
 
 
-######################################################################
-# metric mappings
-#  These map from metric id strings to class names
-METRIC_MAPPING = {'euclidean': EuclideanDistance,
-                  'l2': EuclideanDistance,
-                  'minkowski': MinkowskiDistance,
-                  'p': MinkowskiDistance,
-                  'manhattan': ManhattanDistance,
-                  'cityblock': ManhattanDistance,
-                  'l1': ManhattanDistance,
-                  'chebyshev': ChebyshevDistance,
-                  'infinity': ChebyshevDistance,
-                  'seuclidean': SEuclideanDistance,
-                  'mahalanobis': MahalanobisDistance,
-                  'wminkowski': WMinkowskiDistance,
-                  'hamming': HammingDistance,
-                  'canberra': CanberraDistance,
-                  'braycurtis': BrayCurtisDistance,
-                  'matching': MatchingDistance,
-                  'jaccard': JaccardDistance,
-                  'dice': DiceDistance,
-                  'kulsinski': KulsinskiDistance,
-                  'rogerstanimoto': RogersTanimotoDistance,
-                  'russellrao': RussellRaoDistance,
-                  'sokalmichener': SokalMichenerDistance,
-                  'sokalsneath': SokalSneathDistance,
-                  'haversine': HaversineDistance,
-                  'pyfunc': PyFuncDistance}
-
 BOOL_METRICS = [
     "hamming",
     "matching",
@@ -95,6 +56,51 @@ def get_valid_metric_ids(L):
             if (val.__name__ in L) or (val in L)]
 
 
+######################################################################
+# metric mappings
+#  These map from metric id strings to class names
+METRIC_MAPPING = {
+    'euclidean': EuclideanDistance,
+    'l2': EuclideanDistance,
+    'minkowski': MinkowskiDistance,
+    'p': MinkowskiDistance,
+    'manhattan': ManhattanDistance,
+    'cityblock': ManhattanDistance,
+    'l1': ManhattanDistance,
+    'chebyshev': ChebyshevDistance,
+    'infinity': ChebyshevDistance,
+    'seuclidean': SEuclideanDistance,
+    'mahalanobis': MahalanobisDistance,
+    'wminkowski': WMinkowskiDistance,
+    'hamming': HammingDistance,
+    'canberra': CanberraDistance,
+    'braycurtis': BrayCurtisDistance,
+    'matching': MatchingDistance,
+    'jaccard': JaccardDistance,
+    'dice': DiceDistance,
+    'kulsinski': KulsinskiDistance,
+    'rogerstanimoto': RogersTanimotoDistance,
+    'russellrao': RussellRaoDistance,
+    'sokalmichener': SokalMichenerDistance,
+    'sokalsneath': SokalSneathDistance,
+    'haversine': HaversineDistance,
+    'pyfunc': PyFuncDistance,
+}
+
+cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
+    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
+    # In particular, if x is deallocated before the returned array goes
+    # out of scope, this could cause memory errors.  Since there is not
+    # a possibility of this for our use-case, this should be safe.
+
+    # Note: this Segfaults unless np.import_array() is called above
+    return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)
+
+
+from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
+cdef DTYPE_t INF = np.inf
+
+
 ######################################################################
 # Distance Metric Classes
 cdef class DistanceMetric:
@@ -482,8 +488,8 @@ cdef class SEuclideanDistance(DistanceMetric):
         cdef DTYPE_t tmp, d=0
         cdef np.intp_t j
         for j in range(size):
-            tmp = x1[j] - x2[j]
-            d += tmp * tmp / self.vec[j]
+            tmp = <DTYPE_t> (x1[j] - x2[j])
+            d += <DTYPE_t> (tmp * tmp / self.vec[j])
         return d
 
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
@@ -520,7 +526,7 @@ cdef class ManhattanDistance(DistanceMetric):
         cdef DTYPE_t d = 0
         cdef np.intp_t j
         for j in range(size):
-            d += fabs(x1[j] - x2[j])
+            d += <DTYPE_t> fabs(x1[j] - x2[j])
         return d
 
 
@@ -553,7 +559,7 @@ cdef class ChebyshevDistance(DistanceMetric):
         cdef DTYPE_t d = 0
         cdef np.intp_t j
         for j in range(size):
-            d = fmax(d, fabs(x1[j] - x2[j]))
+            d = <DTYPE_t> fmax(d, fabs(x1[j] - x2[j]))
         return d
 
 
@@ -617,21 +623,21 @@ cdef class MinkowskiDistance(DistanceMetric):
         cdef bint has_w = self.size > 0
         if has_w:
             for j in range(size):
-                d += self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)
+                d += <DTYPE_t> (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p))
         else:
             for j in range(size):
-                d += pow(fabs(x1[j] - x2[j]), self.p)
+                d += <DTYPE_t> (pow(fabs(x1[j] - x2[j]), self.p))
         return d
 
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
-        return pow(self.rdist(x1, x2, size), 1. / self.p)
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
 
     cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return pow(rdist, 1. / self.p)
+        return <DTYPE_t> pow(rdist, 1. / self.p)
 
     cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return pow(dist, self.p)
+        return <DTYPE_t> pow(dist, self.p)
 
     def rdist_to_dist(self, rdist):
         return rdist ** (1. / self.p)
@@ -686,21 +692,21 @@ cdef class WMinkowskiDistance(DistanceMetric):
 
     cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d=0
+        cdef DTYPE_t d = 0
         cdef np.intp_t j
         for j in range(size):
-            d += pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p)
+            d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
         return d
 
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
-        return pow(self.rdist(x1, x2, size), 1. / self.p)
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
 
     cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return pow(rdist, 1. / self.p)
+        return <DTYPE_t> pow(rdist, 1. / self.p)
 
     cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return pow(dist, self.p)
+        return <DTYPE_t> pow(dist, self.p)
 
     def rdist_to_dist(self, rdist):
         return rdist ** (1. / self.p)
@@ -736,7 +742,7 @@ cdef class MahalanobisDistance(DistanceMetric):
         if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
             raise ValueError("V/VI must be square")
 
-        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=float, order='C'))
+        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=DTYPE, order='C'))
 
         self.size = self.mat.shape[0]
 
@@ -819,9 +825,9 @@ cdef class CanberraDistance(DistanceMetric):
         cdef DTYPE_t denom, d = 0
         cdef np.intp_t j
         for j in range(size):
-            denom = fabs(x1[j]) + fabs(x2[j])
+            denom = <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
             if denom > 0:
-                d += fabs(x1[j] - x2[j]) / denom
+                d += <DTYPE_t>(fabs(x1[j] - x2[j])) / denom
         return d
 
 
@@ -842,8 +848,8 @@ cdef class BrayCurtisDistance(DistanceMetric):
         cdef DTYPE_t num = 0, denom = 0
         cdef np.intp_t j
         for j in range(size):
-            num += fabs(x1[j] - x2[j])
-            denom += fabs(x1[j]) + fabs(x2[j])
+            num += <DTYPE_t> fabs(x1[j] - x2[j])
+            denom += <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
         if denom > 0:
             return num / denom
         else:
@@ -860,8 +866,6 @@ cdef class JaccardDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -890,8 +894,6 @@ cdef class MatchingDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{N}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -914,8 +916,6 @@ cdef class DiceDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -939,8 +939,6 @@ cdef class KulsinskiDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -964,8 +962,6 @@ cdef class RogersTanimotoDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -988,8 +984,6 @@ cdef class RussellRaoDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N - N_{TT}}{N}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -1012,8 +1006,6 @@ cdef class SokalMichenerDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -1036,8 +1028,6 @@ cdef class SokalSneathDistance(DistanceMetric):
     vectors. All nonzero entries will be treated as True, zero entries will
     be treated as False.
 
-    .. math::
-       D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}}
     """
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                              ITYPE_t size) nogil except -1:
@@ -1063,9 +1053,6 @@ cdef class HaversineDistance(DistanceMetric):
     to be the latitude, the second is the longitude, given in radians.
     The dimension of the points must be 2:
 
-    .. math::
-       D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)
-                                + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]
     """
 
     def _validate_data(self, X):
@@ -1075,8 +1062,8 @@ cdef class HaversineDistance(DistanceMetric):
 
     cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                               ITYPE_t size) nogil except -1:
-        cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0]))
-        cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1]))
+        cdef DTYPE_t sin_0 = <DTYPE_t> sin(0.5 * (x1[0] - x2[0]))
+        cdef DTYPE_t sin_1 = <DTYPE_t> sin(0.5 * (x1[1] - x2[1]))
         return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
 
     cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
@@ -1087,7 +1074,7 @@ cdef class HaversineDistance(DistanceMetric):
         return 2 * asin(sqrt(rdist))
 
     cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        cdef DTYPE_t tmp = sin(0.5 * dist)
+        cdef DTYPE_t tmp = <DTYPE_t> sin(0.5 * dist)
         return tmp * tmp
 
     def rdist_to_dist(self, rdist):
@@ -1097,72 +1084,6 @@ cdef class HaversineDistance(DistanceMetric):
         tmp = np.sin(0.5 * dist)
         return tmp * tmp
 
-
-#------------------------------------------------------------
-# Yule Distance (boolean)
-#  D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft)
-# [This is not a true metric, so we will leave it out.]
-#
-#cdef class YuleDistance(DistanceMetric):
-#    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-#                             ITYPE_t size):
-#        cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0
-#        cdef np.intp_t j
-#        for j in range(size):
-#            tf1 = x1[j] != 0
-#            tf2 = x2[j] != 0
-#            ntt += tf1 and tf2
-#            ntf += tf1 and (tf2 == 0)
-#            nft += (tf1 == 0) and tf2
-#        nff = size - ntt - ntf - nft
-#        return (2.0 * ntf * nft) / (ntt * nff + ntf * nft)
-
-
-#------------------------------------------------------------
-# Cosine Distance
-#  D(x, y) = dot(x, y) / (|x| * |y|)
-# [This is not a true metric, so we will leave it out.]
-#
-#cdef class CosineDistance(DistanceMetric):
-#    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-#                             ITYPE_t size):
-#        cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0
-#        cdef np.intp_t j
-#        for j in range(size):
-#            d += x1[j] * x2[j]
-#            norm1 += x1[j] * x1[j]
-#            norm2 += x2[j] * x2[j]
-#        return 1.0 - d / sqrt(norm1 * norm2)
-
-
-#------------------------------------------------------------
-# Correlation Distance
-#  D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|)
-# [This is not a true metric, so we will leave it out.]
-#
-#cdef class CorrelationDistance(DistanceMetric):
-#    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-#                             ITYPE_t size):
-#        cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0
-#        cdef DTYPE_t tmp1, tmp2
-#
-#        cdef np.intp_t i
-#        for i in range(size):
-#            mu1 += x1[i]
-#            mu2 += x2[i]
-#        mu1 /= size
-#        mu2 /= size
-#
-#        for i in range(size):
-#            tmp1 = x1[i] - mu1
-#            tmp2 = x2[i] - mu2
-#            x1nrm += tmp1 * tmp1
-#            x2nrm += tmp2 * tmp2
-#            x1Tx2 += tmp1 * tmp2
-#
-#        return (1. - x1Tx2) / sqrt(x1nrm * x2nrm)
-
-
 #------------------------------------------------------------
 # User-defined distance
 #
@@ -1205,10 +1126,6 @@ cdef class PyFuncDistance(DistanceMetric):
                             "vectors and return a float.")
 
 
-cdef inline double fmax(double a, double b) nogil:
-    return max(a, b)
-
-
 ######################################################################
 # Datasets Pair Classes
 cdef class DatasetsPair:
@@ -1285,8 +1202,8 @@ cdef class DatasetsPair:
 
         if not(X.dtype == Y.dtype == np.float64):
             raise ValueError(
-                f"Only 64bit float datasets are supported at this time, "
-                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}"
+                f"Datasets must be of np.float64 type. "
+                f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
             )
 
         # Metric-specific checks that do not replace nor duplicate `check_array`.
@@ -1368,3 +1285,1233 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
         return self.distance_metric.dist(&self.X[i, 0],
                                          &self.Y[j, 0],
                                          self.d)
+
+######################################################################
+# metric mappings
+#  These map from metric id strings to the metric classes
+METRIC_MAPPING32 = {
+    'euclidean': EuclideanDistance32,
+    'l2': EuclideanDistance32,
+    'minkowski': MinkowskiDistance32,
+    'p': MinkowskiDistance32,
+    'manhattan': ManhattanDistance32,
+    'cityblock': ManhattanDistance32,
+    'l1': ManhattanDistance32,
+    'chebyshev': ChebyshevDistance32,
+    'infinity': ChebyshevDistance32,
+    'seuclidean': SEuclideanDistance32,
+    'mahalanobis': MahalanobisDistance32,
+    'wminkowski': WMinkowskiDistance32,
+    'hamming': HammingDistance32,
+    'canberra': CanberraDistance32,
+    'braycurtis': BrayCurtisDistance32,
+    'matching': MatchingDistance32,
+    'jaccard': JaccardDistance32,
+    'dice': DiceDistance32,
+    'kulsinski': KulsinskiDistance32,
+    'rogerstanimoto': RogersTanimotoDistance32,
+    'russellrao': RussellRaoDistance32,
+    'sokalmichener': SokalMichenerDistance32,
+    'sokalsneath': SokalSneathDistance32,
+    'haversine': HaversineDistance32,
+    'pyfunc': PyFuncDistance32,
+}
+
+cdef inline np.ndarray _buffer_to_ndarray32(const np.float32_t* x, np.npy_intp n):
+    # Wrap a float32 memory buffer with a 1-D ndarray. Warning: this is not
+    # robust: if x is deallocated before the returned array goes out of
+    # scope, this could cause memory errors.
+    # The type code must describe float32 items; DTYPECODE is the DTYPE_t code
+    # (presumably float64 -- confirm), which would misread this buffer.
+    # Note: this Segfaults unless np.import_array() is called above
+    return PyArray_SimpleNewFromData(1, &n, np.NPY_FLOAT32, <void*>x)
+
+
+from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
+cdef np.float32_t INF32 = np.inf
+
+
+######################################################################
+# Distance Metric Classes
+cdef class DistanceMetric32:
+    """DistanceMetric class
+
+    This class provides a uniform interface to fast distance metric
+    functions.  The various metrics can be accessed via the :meth:`get_metric`
+    class method and the metric string identifier (see below).
+
+    Examples
+    --------
+    >>> from sklearn.metrics import DistanceMetric
+    >>> dist = DistanceMetric.get_metric('euclidean')
+    >>> X = [[0, 1, 2],
+    ...      [3, 4, 5]]
+    >>> dist.pairwise(X)
+    array([[ 0.        ,  5.19615242],
+           [ 5.19615242,  0.        ]])
+
+    Available Metrics
+
+    The following lists the string metric identifiers and the associated
+    distance metric classes:
+
+    **Metrics intended for real-valued vector spaces:**
+
+    ==============  ====================  ========  ===============================
+    identifier      class name            args      distance function
+    --------------  --------------------  --------  -------------------------------
+    "euclidean"     EuclideanDistance     -         ``sqrt(sum((x - y)^2))``
+    "manhattan"     ManhattanDistance     -         ``sum(|x - y|)``
+    "chebyshev"     ChebyshevDistance     -         ``max(|x - y|)``
+    "minkowski"     MinkowskiDistance     p, w      ``sum(w * |x - y|^p)^(1/p)``
+    "wminkowski"    WMinkowskiDistance    p, w      ``sum(|w * (x - y)|^p)^(1/p)``
+    "seuclidean"    SEuclideanDistance    V         ``sqrt(sum((x - y)^2 / V))``
+    "mahalanobis"   MahalanobisDistance   V or VI   ``sqrt((x - y)' V^-1 (x - y))``
+    ==============  ====================  ========  ===============================
+
+    .. deprecated:: 1.1
+        `WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3.
+        Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are
+        applied to the absolute differences already raised to the p power. This is different from
+        `WMinkowskiDistance` where weights are applied to the absolute differences before raising
+        to the p power. The deprecation aims to remain consistent with SciPy 1.8 convention.
+
+    **Metrics intended for two-dimensional vector spaces:**  Note that the haversine
+    distance metric requires data in the form of [latitude, longitude] and both
+    inputs and outputs are in units of radians.
+
+    ============  ==================  ===============================================================
+    identifier    class name          distance function
+    ------------  ------------------  ---------------------------------------------------------------
+    "haversine"   HaversineDistance   ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))``
+    ============  ==================  ===============================================================
+
+
+    **Metrics intended for integer-valued vector spaces:**  Though intended
+    for integer-valued vectors, these are also valid metrics in the case of
+    real-valued vectors.
+
+    =============  ====================  ========================================
+    identifier     class name            distance function
+    -------------  --------------------  ----------------------------------------
+    "hamming"      HammingDistance       ``N_unequal(x, y) / N_tot``
+    "canberra"     CanberraDistance      ``sum(|x - y| / (|x| + |y|))``
+    "braycurtis"   BrayCurtisDistance    ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
+    =============  ====================  ========================================
+
+    **Metrics intended for boolean-valued vector spaces:**  Any nonzero entry
+    is evaluated to "True".  In the listings below, the following
+    abbreviations are used:
+
+     - N  : number of dimensions
+     - NTT : number of dims in which both values are True
+     - NTF : number of dims in which the first value is True, second is False
+     - NFT : number of dims in which the first value is False, second is True
+     - NFF : number of dims in which both values are False
+     - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
+     - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
+
+    =================  =======================  ===============================
+    identifier         class name               distance function
+    -----------------  -----------------------  -------------------------------
+    "jaccard"          JaccardDistance          NNEQ / NNZ
+    "matching"         MatchingDistance         NNEQ / N
+    "dice"             DiceDistance             NNEQ / (NTT + NNZ)
+    "kulsinski"        KulsinskiDistance        (NNEQ + N - NTT) / (NNEQ + N)
+    "rogerstanimoto"   RogersTanimotoDistance   2 * NNEQ / (N + NNEQ)
+    "russellrao"       RussellRaoDistance       (N - NTT) / N
+    "sokalmichener"    SokalMichenerDistance    2 * NNEQ / (N + NNEQ)
+    "sokalsneath"      SokalSneathDistance      NNEQ / (NNEQ + 0.5 * NTT)
+    =================  =======================  ===============================
+
+    **User-defined distance:**
+
+    ===========    ===============    =======
+    identifier     class name         args
+    -----------    ---------------    -------
+    "pyfunc"       PyFuncDistance     func
+    ===========    ===============    =======
+
+    Here ``func`` is a function which takes two one-dimensional numpy
+    arrays, and returns a distance.  Note that in order to be used within
+    the BallTree, the distance must be a true metric:
+    i.e. it must satisfy the following properties
+
+    1) Non-negativity: d(x, y) >= 0
+    2) Identity: d(x, y) = 0 if and only if x == y
+    3) Symmetry: d(x, y) = d(y, x)
+    4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
+
+    Because of the Python object overhead involved in calling the python
+    function, this will be fairly slow, but it will have the same
+    scaling as other distances.
+    """
+    def __cinit__(self):
+        self.p = 2
+        self.vec = np.zeros(1, dtype=np.float32, order='C')
+        self.mat = np.zeros((1, 1), dtype=np.float32, order='C')
+        self.size = 1
+
+    def __reduce__(self):
+        """
+        reduce method used for pickling
+        """
+        return (newObj, (self.__class__,), self.__getstate__())
+
+    def __getstate__(self):
+        """
+        get state for pickling
+        """
+        if self.__class__.__name__ == "PyFuncDistance32":
+            return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs)
+        return (float(self.p), np.asarray(self.vec), np.asarray(self.mat))
+
+    def __setstate__(self, state):
+        """
+        set state for pickling
+        """
+        self.p = state[0]
+        self.vec = ReadonlyArrayWrapper(state[1])
+        self.mat = ReadonlyArrayWrapper(state[2])
+        if self.__class__.__name__ == "PyFuncDistance32":
+            self.func = state[3]
+            self.kwargs = state[4]
+        self.size = self.vec.shape[0]
+
+    @classmethod
+    def get_metric(cls, metric, **kwargs):
+        """Get the given distance metric from the string identifier.
+
+        See the docstring of DistanceMetric for a list of available metrics.
+
+        Parameters
+        ----------
+        metric : str, callable, DistanceMetric32 instance or subclass
+            The distance metric to use; instances are returned unchanged.
+        **kwargs
+            additional arguments will be passed to the requested metric
+        """
+        if isinstance(metric, DistanceMetric32):
+            return metric
+
+        # Classes are callable too: detect metric classes before the callable check.
+        is_class = isinstance(metric, type) and issubclass(metric, DistanceMetric32)
+        if callable(metric) and not is_class:
+            return PyFuncDistance32(metric, **kwargs)
+
+        # Map the metric string ID to the metric class
+        if not is_class:
+            try:
+                metric = METRIC_MAPPING32[metric]
+            except (KeyError, TypeError):  # TypeError: unhashable identifier
+                raise ValueError(f"Unrecognized metric '{metric}'")
+
+        # In Minkowski special cases, return more efficient methods
+        if metric is MinkowskiDistance32:
+            p = kwargs.pop('p', 2)
+            w = kwargs.pop('w', None)
+            if p == 1 and w is None:
+                return ManhattanDistance32(**kwargs)
+            elif p == 2 and w is None:
+                return EuclideanDistance32(**kwargs)
+            elif np.isinf(p) and w is None:
+                return ChebyshevDistance32(**kwargs)
+            else:
+                return MinkowskiDistance32(p, w, **kwargs)
+        else:
+            return metric(**kwargs)
+
+    def __init__(self):
+        if self.__class__ is DistanceMetric32:
+            raise NotImplementedError("DistanceMetric32 is an abstract class")
+
+    def _validate_data(self, X):
+        """Validate the input data.
+
+        This should be overridden in a subclass if a specific input format
+        is required.
+        """
+        return
+
+    cdef DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                      ITYPE_t size) nogil except -1:
+        """Compute the distance between vectors x1 and x2
+
+        This should be overridden in a subclass.
+        """
+        return -999  # placeholder value returned by this base implementation
+
+    cdef DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                       ITYPE_t size) nogil except -1:
+        """Compute the rank-preserving surrogate distance between vectors x1 and x2.
+
+        This can optionally be overridden in a subclass.
+
+        The rank-preserving surrogate distance is any measure that yields the same
+        rank as the distance, but is more efficient to compute. For example, the
+        rank-preserving surrogate distance of the Euclidean metric is the
+        squared-euclidean distance.
+        """
+        return self.dist(x1, x2, size)  # default: the distance is its own surrogate
+
+    cdef int pdist(self, const np.float32_t[:, ::1] X, np.float32_t[:, ::1] D) except -1:
+        """compute the pairwise distances between points in X"""
+        cdef ITYPE_t i1, i2
+        for i1 in range(X.shape[0]):
+            for i2 in range(i1, X.shape[0]):
+                D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1])
+                D[i2, i1] = D[i1, i2]
+        return 0
+
+    cdef int cdist(self, const np.float32_t[:, ::1] X, const np.float32_t[:, ::1] Y,
+                   np.float32_t[:, ::1] D) except -1:
+        """compute the cross-pairwise distances between arrays X and Y"""
+        cdef ITYPE_t i1, i2
+        if X.shape[1] != Y.shape[1]:
+            raise ValueError('X and Y must have the same second dimension')
+        for i1 in range(X.shape[0]):
+            for i2 in range(Y.shape[0]):
+                D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
+        return 0
+
+    cdef DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        """Convert the rank-preserving surrogate distance to the distance"""
+        return rdist
+
+    cdef DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        """Convert the distance to the rank-preserving surrogate distance"""
+        return dist
+
+    def rdist_to_dist(self, rdist):
+        """Convert the rank-preserving surrogate distance to the distance.
+
+        The surrogate distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, the
+        rank-preserving surrogate distance of the Euclidean metric is the
+        squared-euclidean distance.
+
+        Parameters
+        ----------
+        rdist : double
+            Surrogate distance.
+
+        Returns
+        -------
+        double
+            True distance.
+        """
+        return rdist
+
+    def dist_to_rdist(self, dist):
+        """Convert the true distance to the rank-preserving surrogate distance.
+
+        The surrogate distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, the
+        rank-preserving surrogate distance of the Euclidean metric is the
+        squared-euclidean distance.
+
+        Parameters
+        ----------
+        dist : double
+            True distance.
+
+        Returns
+        -------
+        double
+            Surrogate distance.
+        """
+        return dist
+
+    def pairwise(self, X, Y=None):
+        """Compute the pairwise distances between X and Y
+
+        This is a convenience routine for the sake of testing.  For many
+        metrics, the utilities in scipy.spatial.distance.cdist and
+        scipy.spatial.distance.pdist will be faster.
+
+        Parameters
+        ----------
+        X : array-like
+            Array of shape (Nx, D), representing Nx points in D dimensions.
+        Y : array-like (optional)
+            Array of shape (Ny, D), representing Ny points in D dimensions.
+            If not specified, then Y=X.
+
+        Returns
+        -------
+        dist : ndarray
+            The shape (Nx, Ny) array of pairwise distances between points in
+            X and Y.
+        """
+        cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Xarr
+        cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Yarr
+        cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Darr
+
+        Xarr = np.asarray(X, dtype=np.float32, order='C')
+        self._validate_data(Xarr)
+        if Y is None:
+            Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
+                         dtype=np.float32, order='C')
+            self.pdist(Xarr, Darr)
+        else:
+            Yarr = np.asarray(Y, dtype=np.float32, order='C')
+            self._validate_data(Yarr)
+            Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
+                         dtype=np.float32, order='C')
+            self.cdist(Xarr, Yarr, Darr)
+        return Darr
+
+
+#------------------------------------------------------------
+# Euclidean Distance
+#  d = sqrt(sum((x_i - y_i)^2))
+cdef class EuclideanDistance32(DistanceMetric32):
+    r"""Euclidean Distance metric
+
+    .. math::
+       D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }
+    """
+    def __init__(self):
+        self.p = 2
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return euclidean_dist32(x1, x2, size)
+
+    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) nogil except -1:
+        return euclidean_rdist32(x1, x2, size)
+
+    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        return sqrt(rdist)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        return dist * dist
+
+    def rdist_to_dist(self, rdist):
+        return np.sqrt(rdist)
+
+    def dist_to_rdist(self, dist):
+        return dist ** 2
+
+
+#------------------------------------------------------------
+# SEuclidean Distance
+#  d = sqrt(sum((x_i - y_i)^2 / v_i))
+cdef class SEuclideanDistance32(DistanceMetric32):
+    r"""Standardized Euclidean Distance metric
+
+    .. math::
+       D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
+    """
+    def __init__(self, V):
+        self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=np.float32))
+        self.size = self.vec.shape[0]
+        self.p = 2
+
+    def _validate_data(self, X):
+        if X.shape[1] != self.size:
+            raise ValueError('SEuclidean dist: size of V does not match')
+
+    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t tmp, d=0
+        cdef np.intp_t j
+        for j in range(size):
+            tmp = <DTYPE_t> (x1[j] - x2[j])
+            d += <DTYPE_t> (tmp * tmp / self.vec[j])
+        return d
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return sqrt(self.rdist(x1, x2, size))
+
+    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        return sqrt(rdist)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        return dist * dist
+
+    def rdist_to_dist(self, rdist):
+        return np.sqrt(rdist)
+
+    def dist_to_rdist(self, dist):
+        return dist ** 2
+
+
+#------------------------------------------------------------
+# Manhattan Distance
+#  d = sum(abs(x_i - y_i))
+cdef class ManhattanDistance32(DistanceMetric32):
+    r"""Manhattan/City-block Distance metric
+
+    .. math::
+       D(x, y) = \sum_i |x_i - y_i|
+    """
+    def __init__(self):
+        self.p = 1
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            d += <DTYPE_t> fabs(x1[j] - x2[j])
+        return d
+
+
+#------------------------------------------------------------
+# Chebyshev Distance
+#  d = max_i(abs(x_i - y_i))
+cdef class ChebyshevDistance32(DistanceMetric32):
+    """Chebyshev/Infinity Distance
+
+    .. math::
+       D(x, y) = max_i (|x_i - y_i|)
+
+    Examples
+    --------
+    >>> from sklearn.metrics import DistanceMetric
+    >>> dist = DistanceMetric.get_metric('chebyshev')
+    >>> X = [[0, 1, 2],
+    ...      [3, 4, 5]]
+    >>> Y = [[-1, 0, 1],
+    ...      [3, 4, 5]]
+    >>> dist.pairwise(X, Y)
+    array([[1., 3.],
+           [4., 0.]])
+    """
+    def __init__(self):
+        self.p = INF32
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            d = <DTYPE_t> fmax(d, fabs(x1[j] - x2[j]))
+        return d
+
+
+#------------------------------------------------------------
+# Minkowski Distance
+cdef class MinkowskiDistance32(DistanceMetric32):
+    r"""Minkowski Distance
+
+    .. math::
+        D(x, y) = {||x-y||}_p
+
+    when w is None.
+
+    Here is the more general expanded expression for the weighted case:
+
+    .. math::
+        D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p)
+
+    Parameters
+    ----------
+    p : int
+        The order of the p-norm of the difference (see above).
+    w : (N,) array-like (optional)
+        The weight vector.
+
+    Minkowski Distance requires p >= 1 and finite. For p = infinity,
+    use ChebyshevDistance.
+    Note that for p=1, ManhattanDistance is more efficient, and for
+    p=2, EuclideanDistance is more efficient.
+    """
+    def __init__(self, p, w=None):
+        if p < 1:
+            raise ValueError("p must be greater than 1")
+        elif np.isinf(p):
+            raise ValueError("MinkowskiDistance requires finite p. "
+                             "For p=inf, use ChebyshevDistance.")
+
+        self.p = p
+        if w is not None:
+            w_array = check_array(
+                w, ensure_2d=False, dtype=np.float32, input_name="w"
+            )
+            if (w_array < 0).any():
+                raise ValueError("w cannot contain negative weights")
+            self.vec = ReadonlyArrayWrapper(w_array)
+            self.size = self.vec.shape[0]
+        else:
+            self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=np.float32))
+            self.size = 0
+
+    def _validate_data(self, X):
+        if self.size > 0 and X.shape[1] != self.size:
+            raise ValueError("MinkowskiDistance: the size of w must match "
+                             f"the number of features ({X.shape[1]}). "
+                             f"Currently len(w)={self.size}.")
+
+    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d=0
+        cdef np.intp_t j
+        cdef bint has_w = self.size > 0
+        if has_w:
+            for j in range(size):
+                d += <DTYPE_t> (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p))
+        else:
+            for j in range(size):
+                d += <DTYPE_t> (pow(fabs(x1[j] - x2[j]), self.p))
+        return d
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
+
+    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        return <DTYPE_t> pow(rdist, 1. / self.p)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        return <DTYPE_t> pow(dist, self.p)
+
+    def rdist_to_dist(self, rdist):
+        return rdist ** (1. / self.p)
+
+    def dist_to_rdist(self, dist):
+        return dist ** self.p
+
+
+#------------------------------------------------------------
+# TODO: Remove in 1.3 - WMinkowskiDistance class
+# W-Minkowski Distance
+cdef class WMinkowskiDistance32(DistanceMetric32):
+    r"""Weighted Minkowski Distance
+
+    .. math::
+       D(x, y) = [\sum_i |w_i * (x_i - y_i)|^p] ^ (1/p)
+
+    Weighted Minkowski Distance requires p >= 1 and finite.
+
+    Parameters
+    ----------
+    p : int
+        The order of the norm of the difference :math:`{||x-y||}_p`.
+    w : (N,) array-like
+        The weight vector.
+
+    """
+    def __init__(self, p, w):
+        from warnings import warn
+        warn("WMinkowskiDistance is deprecated in version 1.1 and will be "
+            "removed in version 1.3. Use MinkowskiDistance instead. Note "
+            "that in MinkowskiDistance, the weights are applied to the "
+            "absolute differences raised to the p power. This is different "
+            "from WMinkowskiDistance where weights are applied to the "
+            "absolute differences before raising to the p power. "
+            "The deprecation aims to remain consistent with SciPy 1.8 "
+            "convention.", FutureWarning)
+
+        if p < 1:
+            raise ValueError("p must be greater than 1")
+        elif np.isinf(p):
+            raise ValueError("WMinkowskiDistance requires finite p. "
+                             "For p=inf, use ChebyshevDistance.")
+        self.p = p
+        self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=np.float32))
+        self.size = self.vec.shape[0]
+
+    def _validate_data(self, X):
+        if X.shape[1] != self.size:
+            raise ValueError('WMinkowskiDistance dist: '
+                             'size of w does not match')
+
+    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
+        return d
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
+
+    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        return <DTYPE_t> pow(rdist, 1. / self.p)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        return <DTYPE_t> pow(dist, self.p)
+
+    def rdist_to_dist(self, rdist):
+        return rdist ** (1. / self.p)
+
+    def dist_to_rdist(self, dist):
+        return dist ** self.p
+
+
+#------------------------------------------------------------
+# Mahalanobis Distance
+#  d = sqrt( (x - y)^T V^-1 (x - y) )
+cdef class MahalanobisDistance32(DistanceMetric32):
+    r"""Mahalanobis Distance
+
+    .. math::
+       D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) }
+
+    Parameters
+    ----------
+    V : array-like
+        Symmetric positive-definite covariance matrix.
+        The inverse of this matrix will be explicitly computed.
+    VI : array-like
+        optionally specify the inverse directly.  If VI is passed,
+        then V is not referenced.
+    """
+    def __init__(self, V=None, VI=None):
+        if VI is None:
+            if V is None:
+                raise ValueError("Must provide either V or VI "
+                                 "for Mahalanobis distance")
+            VI = np.linalg.inv(V)
+        VI = np.asarray(VI)  # VI may be any array-like, e.g. nested lists
+        if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
+            raise ValueError("V/VI must be square")
+
+        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=np.float32, order='C'))
+        self.size = self.mat.shape[0]
+
+        # we need vec as a work buffer
+        self.vec = np.zeros(self.size, dtype=np.float32)
+
+    def _validate_data(self, X):
+        if X.shape[1] != self.size:
+            raise ValueError('Mahalanobis dist: size of V does not match')
+
+    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t tmp, d = 0
+        cdef np.intp_t i, j
+
+        # compute (x1 - x2).T * VI * (x1 - x2)
+        for i in range(size):
+            self.vec[i] = x1[i] - x2[i]
+
+        for i in range(size):
+            tmp = 0
+            for j in range(size):
+                tmp += self.mat[i, j] * self.vec[j]
+            d += tmp * self.vec[i]
+        return d
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return sqrt(self.rdist(x1, x2, size))
+
+    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        return sqrt(rdist)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        return dist * dist
+
+    def rdist_to_dist(self, rdist):
+        return np.sqrt(rdist)
+
+    def dist_to_rdist(self, dist):
+        return dist ** 2
+
+
+#------------------------------------------------------------
+# Hamming Distance
+#  d = N_unequal(x, y) / N_tot
+cdef class HammingDistance32(DistanceMetric32):
+    r"""Hamming Distance
+
+    Hamming distance is meant for discrete-valued vectors, though it is
+    a valid metric for real-valued vectors.
+
+    .. math::
+       D(x, y) = \frac{1}{N} \sum_i (1 - \delta_{x_i, y_i})
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int n_unequal = 0
+        cdef np.intp_t j
+        for j in range(size):
+            if x1[j] != x2[j]:
+                n_unequal += 1
+        return float(n_unequal) / size
+
+
+#------------------------------------------------------------
+# Canberra Distance
+#  D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ]
+cdef class CanberraDistance32(DistanceMetric32):
+    r"""Canberra Distance
+
+    Canberra distance is meant for discrete-valued vectors, though it is
+    a valid metric for real-valued vectors.
+
+    .. math::
+       D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t denom, d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            denom = <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
+            if denom > 0:
+                d += <DTYPE_t>(fabs(x1[j] - x2[j])) / denom
+        return d
+
+
+#------------------------------------------------------------
+# Bray-Curtis Distance
+#  D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)]
+cdef class BrayCurtisDistance32(DistanceMetric32):
+    r"""Bray-Curtis Distance
+
+    Bray-Curtis distance is meant for discrete-valued vectors, though it is
+    a valid metric for real-valued vectors.
+
+    .. math::
+       D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t num = 0, denom = 0
+        cdef np.intp_t j
+        for j in range(size):
+            num += <DTYPE_t> fabs(x1[j] - x2[j])
+            denom += <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
+        if denom > 0:
+            return num / denom
+        else:
+            return 0.0
+
+
+#------------------------------------------------------------
+# Jaccard Distance (boolean)
+#  D(x, y) = N_unequal(x, y) / N_nonzero(x, y)
+cdef class JaccardDistance32(DistanceMetric32):
+    r"""Jaccard dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. The result is the number of differing positions divided
+    by the number of positions where at least one vector is True.
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int a, b, n_both = 0, n_any = 0
+        for k in range(size):
+            a = x1[k] != 0
+            b = x2[k] != 0
+            n_both += (a and b)
+            n_any += (a or b)
+        # Two all-zero vectors: return 0 instead of nan, following the change
+        # made to the scipy>=1.2.0 jaccard metric, see
+        # https://github.com/scipy/scipy/pull/7373
+        if n_any == 0:
+            return 0
+        return (n_any - n_both) * 1.0 / n_any
+
+
+#------------------------------------------------------------
+# Matching Distance (boolean)
+#  D(x, y) = n_neq / n
+cdef class MatchingDistance32(DistanceMetric32):
+    r"""Matching dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. The distance is the number of positions where the two
+    vectors disagree, divided by the vector length.
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int n_diff = 0
+        for k in range(size):
+            # Compare the truth values of the two entries, not their magnitudes.
+            if (x1[k] != 0) != (x2[k] != 0):
+                n_diff += 1
+        return n_diff * 1. / size
+
+
+#------------------------------------------------------------
+# Dice Distance (boolean)
+#  D(x, y) = n_neq / (2 * ntt + n_neq)
+cdef class DiceDistance32(DistanceMetric32):
+    r"""Dice dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. With NTT the number of positions True in both vectors and
+    NNEQ the number of differing positions: NNEQ / (2 * NTT + NNEQ).
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int a, b, n_diff = 0, n_both = 0
+        for k in range(size):
+            a = x1[k] != 0
+            b = x2[k] != 0
+            n_diff += (a != b)
+            n_both += (a and b)
+        return n_diff / (2.0 * n_both + n_diff)
+
+
+#------------------------------------------------------------
+# Kulsinski Distance (boolean)
+#  D(x, y) = (ntf + nft - ntt + n) / (n_neq + n)
+cdef class KulsinskiDistance32(DistanceMetric32):
+    r"""Kulsinski dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. With N the vector length, NTT the positions True in both
+    vectors and NNEQ the differing ones: (NNEQ - NTT + N) / (NNEQ + N).
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int a, b, n_both = 0, n_diff = 0
+        for k in range(size):
+            a = x1[k] != 0
+            b = x2[k] != 0
+            n_both += (a and b)
+            n_diff += (a != b)
+        return (n_diff - n_both + size) * 1.0 / (n_diff + size)
+
+
+#------------------------------------------------------------
+# Rogers-Tanimoto Distance (boolean)
+#  D(x, y) = 2 * n_neq / (n + n_neq)
+cdef class RogersTanimotoDistance32(DistanceMetric32):
+    r"""Rogers-Tanimoto dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. With N the vector length and NNEQ the number of
+    differing positions: 2 * NNEQ / (N + NNEQ).
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int n_diff = 0
+        for k in range(size):
+            # Count the positions whose truth values differ.
+            if (x1[k] != 0) != (x2[k] != 0):
+                n_diff += 1
+        return (2.0 * n_diff) / (size + n_diff)
+
+
+#------------------------------------------------------------
+# Russell-Rao Distance (boolean)
+#  D(x, y) = (n - ntt) / n
+cdef class RussellRaoDistance32(DistanceMetric32):
+    r"""Russell-Rao dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. With N the vector length and NTT the number of positions
+    True in both vectors: (N - NTT) / N.
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int n_both = 0
+        for k in range(size):
+            # Count the positions that are True (nonzero) in both vectors.
+            if x1[k] != 0 and x2[k] != 0:
+                n_both += 1
+        return (size - n_both) * 1. / size
+
+
+#------------------------------------------------------------
+# Sokal-Michener Distance (boolean)
+#  D(x, y) = 2 * n_neq / (n + n_neq)
+cdef class SokalMichenerDistance32(DistanceMetric32):
+    r"""Sokal-Michener dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. With N the vector length and NNEQ the number of
+    differing positions: 2 * NNEQ / (N + NNEQ).
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int a, b, n_diff = 0
+        for k in range(size):
+            a = x1[k] != 0
+            b = x2[k] != 0
+            n_diff += (a != b)
+        return (2.0 * n_diff) / (size + n_diff)
+
+
+#------------------------------------------------------------
+# Sokal-Sneath Distance (boolean)
+#  D(x, y) = n_neq / (0.5 * n_tt + n_neq)
+cdef class SokalSneathDistance32(DistanceMetric32):
+    r"""Sokal-Sneath dissimilarity for boolean-valued vectors.
+
+    Every entry is interpreted as a boolean: nonzero means True and zero
+    means False. With NTT the number of positions True in both vectors and
+    NNEQ the number of differing positions: NNEQ / (0.5 * NTT + NNEQ).
+
+    """
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef np.intp_t k
+        cdef int a, b, n_both = 0, n_diff = 0
+        for k in range(size):
+            a = x1[k] != 0
+            b = x2[k] != 0
+            n_both += (a and b)
+            n_diff += (a != b)
+        return n_diff / (0.5 * n_both + n_diff)
+
+
+#------------------------------------------------------------
+# Haversine Distance (2 dimensional)
+#  D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2)
+#                          + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]}
+cdef class HaversineDistance32(DistanceMetric32):
+    """Haversine (spherical) distance between two points.
+
+    This is the angular distance between two points on the surface of a
+    sphere. Each point holds two coordinates given in radians: the first
+    is assumed to be the latitude and the second the longitude, so the
+    data must have exactly 2 columns.
+
+    """
+
+    def _validate_data(self, X):
+        # Points are (latitude, longitude) pairs: reject any other width.
+        if X.shape[1] != 2:
+            raise ValueError("Haversine distance only valid in 2 dimensions")
+
+    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t s_lat = <DTYPE_t> sin(0.5 * (x1[0] - x2[0]))
+        cdef DTYPE_t s_lon = <DTYPE_t> sin(0.5 * (x1[1] - x2[1]))
+        return (s_lat * s_lat + cos(x1[0]) * cos(x2[0]) * s_lon * s_lon)
+
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return 2 * asin(sqrt(self.rdist(x1, x2, size)))  # angle from the surrogate
+
+    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
+        return 2 * asin(sqrt(rdist))
+
+    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
+        cdef DTYPE_t half_sin = <DTYPE_t> sin(0.5 * dist)
+        return half_sin * half_sin
+
+    def rdist_to_dist(self, rdist):
+        return 2 * np.arcsin(np.sqrt(rdist))  # NumPy counterpart of _rdist_to_dist
+
+    def dist_to_rdist(self, dist):
+        half_sin = np.sin(0.5 * dist)
+        return half_sin * half_sin
+
+#------------------------------------------------------------
+# User-defined distance
+#
+cdef class PyFuncDistance32(DistanceMetric32):
+    """PyFunc Distance: a user-defined distance.
+
+    Parameters
+    ----------
+    func : function
+        func should take two numpy arrays as input, and return a distance.
+    **kwargs
+        Extra keyword arguments passed to ``func`` on every call.
+    """
+    def __init__(self, func, **kwargs):
+        self.func = func
+        self.kwargs = kwargs
+
+    # In cython < 0.26, the GIL had to be acquired both in the definition of
+    # the function and inside its body. This is not allowed in cython >= 0.26
+    # since it is a redundant GIL acquisition. The only backward-compatible
+    # way is to inherit `dist` from the base class without the GIL and have
+    # it call an inline `_dist` which acquires the GIL.
+    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                             ITYPE_t size) nogil except -1:
+        return self._dist(x1, x2, size)
+
+    cdef inline DTYPE_t _dist(self, const np.float32_t* x1, const np.float32_t* x2,
+                              ITYPE_t size) except -1 with gil:
+        cdef np.ndarray x1arr
+        cdef np.ndarray x2arr
+        x1arr = _buffer_to_ndarray32(x1, size)
+        x2arr = _buffer_to_ndarray32(x2, size)
+        d = self.func(x1arr, x2arr, **self.kwargs)
+        try:
+            # Returning d converts it to DTYPE_t: Cython generates code here
+            # that results in a TypeError if d is the wrong type.
+            return d
+        except TypeError:
+            raise TypeError("Custom distance function must accept two "
+                            "vectors and return a float.")
+
+
+######################################################################
+# Datasets Pair Classes
+cdef class DatasetsPair32:
+    """Abstract class which wraps a pair of datasets (X, Y).
+
+    This class allows computing distances between a single pair of rows of
+    X and Y at a time given the pair of their indices (i, j). This class is
+    specialized for each metric thanks to the :func:`get_for` factory classmethod.
+
+    The handling of parallelization over chunks to compute the distances
+    and aggregation for several rows at a time is done in dedicated
+    subclasses of PairwiseDistancesReduction that in-turn rely on
+    subclasses of DatasetsPair for each pair of rows in the data. The goal
+    is to make it possible to decouple the generic parallelization and
+    aggregation logic from metric-specific computation as much as
+    possible.
+
+    X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
+    in subclasses.
+
+    This class avoids the overhead of dispatching distance computations
+    to :class:`sklearn.metrics.DistanceMetric` based on the physical
+    representation of the vectors (sparse vs. dense). It makes use of
+    cython.final to remove the overhead of dispatching method calls.
+
+    Parameters
+    ----------
+    distance_metric: DistanceMetric32
+        The distance metric responsible for computing distances
+        between two vectors of (X, Y).
+    """
+
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        str metric="euclidean",
+        dict metric_kwargs=None,
+    ) -> DatasetsPair32:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        metric : str, default='euclidean'
+            The distance metric to compute between rows of X and Y.
+            The default metric is a fast implementation of the Euclidean
+            metric. For a list of available metrics, see the documentation
+            of :class:`~sklearn.metrics.DistanceMetric`.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        Returns
+        -------
+        datasets_pair: DatasetsPair32
+            The suitable DatasetsPair implementation.
+        """
+        cdef:
+            DistanceMetric32 distance_metric = DistanceMetric32.get_metric(
+                metric,
+                **(metric_kwargs or {})
+            )
+
+        if not(X.dtype == Y.dtype == np.float32):
+            raise ValueError(
+                f"Datasets must be of np.float32 type. "
+                f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+            )
+
+        # Metric-specific checks that neither replace nor duplicate `check_array`.
+        distance_metric._validate_data(X)
+        distance_metric._validate_data(Y)
+
+        # TODO: dispatch to other dataset pairs for sparse support once available:
+        if issparse(X) or issparse(Y):
+            raise ValueError("Only dense datasets are supported for X and Y.")
+
+        return DenseDenseDatasetsPair32(X, Y, distance_metric)
+
+    def __init__(self, DistanceMetric32 distance_metric):
+        self.distance_metric = distance_metric
+
+    cdef ITYPE_t n_samples_X(self) nogil:
+        """Number of samples in X."""
+        # This is an abstract method.
+        # This _must_ always be overwritten in subclasses.
+        # TODO: add "with gil: raise" here when supporting Cython 3.0
+        return -999
+
+    cdef ITYPE_t n_samples_Y(self) nogil:
+        """Number of samples in Y."""
+        # This is an abstract method.
+        # This _must_ always be overwritten in subclasses.
+        # TODO: add "with gil: raise" here when supporting Cython 3.0
+        return -999
+
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        return self.dist(i, j)  # default surrogate: the distance itself
+
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        # This is an abstract method.
+        # This _must_ always be overwritten in subclasses.
+        # TODO: add "with gil: raise" here when supporting Cython 3.0
+        return -1
+
+@final
+cdef class DenseDenseDatasetsPair32(DatasetsPair32):
+    """Pair of dense arrays whose rows are compared with a distance metric.
+
+    Parameters
+    ----------
+    X: ndarray of shape (n_samples_X, n_features)
+        Rows represent vectors. Must be C-contiguous.
+
+    Y: ndarray of shape (n_samples_Y, n_features)
+        Rows represent vectors. Must be C-contiguous.
+
+    distance_metric: DistanceMetric32
+        The metric used to compute the distance between a row of X
+        and a row of Y.
+    """
+
+    def __init__(self, X, Y, DistanceMetric32 distance_metric):
+        # X and Y are expected to have been checked by the caller.
+        super().__init__(distance_metric)
+        self.X = X
+        self.Y = Y
+        self.d = X.shape[1]
+
+    @final
+    cdef ITYPE_t n_samples_X(self) nogil:
+        return self.X.shape[0]  # number of rows of X
+
+    @final
+    cdef ITYPE_t n_samples_Y(self) nogil:
+        return self.Y.shape[0]  # number of rows of Y
+
+    @final
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        # Rank-preserving surrogate between row i of X and row j of Y.
+        return self.distance_metric.rdist(
+            &self.X[i, 0], &self.Y[j, 0], self.d)
+
+    @final
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        # Distance between row i of X and row j of Y.
+        return self.distance_metric.dist(
+            &self.X[i, 0], &self.Y[j, 0], self.d)
diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
new file mode 100644
index 0000000000000..672e3fce086c4
--- /dev/null
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -0,0 +1,1319 @@
+{{py:
+
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, bitness, DTYPE_t, DTYPE
+    #
+    #
+    # On the one hand, an empty string is used for `name_suffix`
+    # for the 64bit case so as to still be able to expose the original
+    # 64bit implementation under the same API, namely `DistanceMetric`.
+    #
+    # On the other hand, '32' is used for `name_suffix`
+    # for the 32bit case to remove ambiguity and use `DistanceMetric32`,
+    # which is not publicly exposed.
+    #
+    # The metric mapping is adapted accordingly to route to the correct
+    # implementations.
+    #
+    # We also use 64bit types as defined in `sklearn.utils._typedefs`
+    # to maintain backward compatibility at the symbol level for extra
+    # safety.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE'),
+    ('32', '32', 'np.float32_t', 'np.float32')
+]
+
+}}
+# By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
+# written for the scikit-learn project
+# License: BSD
+
+import numpy as np
+cimport numpy as np
+from cython cimport final
+
+np.import_array()  # required in order to use C-API
+
+
+# First, define a function to get an ndarray from a memory buffer
+cdef extern from "arrayobject.h":
+    object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims,
+                                     int typenum, void* data)
+
+from scipy.sparse import csr_matrix, issparse
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
+from ..utils._typedefs import DTYPE, ITYPE
+from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
+from ..utils import check_array
+
+cdef inline double fmax(double a, double b) nogil:
+    return b if b > a else a  # larger value; `a` unless `b` compares strictly greater
+
+
+######################################################################
+# newObj: helper function for pickling, returned by DistanceMetric.__reduce__
+#  `obj` is a class; a new instance is created through its __new__ only
+def newObj(obj):
+    return obj.__new__(obj)
+
+
+BOOL_METRICS = [  # metric ids (all are keys of METRIC_MAPPING) grouped as boolean metrics
+    "hamming",
+    "matching",
+    "jaccard",
+    "dice",
+    "kulsinski",
+    "rogerstanimoto",
+    "russellrao",
+    "sokalmichener",
+    "sokalsneath",
+]
+
+def get_valid_metric_ids(L):
+    """Return the metric ids mapped to the classes, or class names, in ``L``.
+
+    >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance'])
+    >>> sorted(L)
+    ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan']
+    """
+    return [
+        metric_id for (metric_id, metric_cls) in METRIC_MAPPING.items()
+        if (metric_cls.__name__ in L) or (metric_cls in L)
+    ]
+
+
+{{for name_suffix, bitness, DTYPE_t, DTYPE in implementation_specific_values}}
+
+######################################################################
+# metric mappings
+#  These map from metric id strings to the metric classes
+METRIC_MAPPING{{name_suffix}} = {
+    'euclidean': EuclideanDistance{{name_suffix}},
+    'l2': EuclideanDistance{{name_suffix}},
+    'minkowski': MinkowskiDistance{{name_suffix}},
+    'p': MinkowskiDistance{{name_suffix}},
+    'manhattan': ManhattanDistance{{name_suffix}},
+    'cityblock': ManhattanDistance{{name_suffix}},
+    'l1': ManhattanDistance{{name_suffix}},
+    'chebyshev': ChebyshevDistance{{name_suffix}},
+    'infinity': ChebyshevDistance{{name_suffix}},
+    'seuclidean': SEuclideanDistance{{name_suffix}},
+    'mahalanobis': MahalanobisDistance{{name_suffix}},
+    'wminkowski': WMinkowskiDistance{{name_suffix}},
+    'hamming': HammingDistance{{name_suffix}},
+    'canberra': CanberraDistance{{name_suffix}},
+    'braycurtis': BrayCurtisDistance{{name_suffix}},
+    'matching': MatchingDistance{{name_suffix}},
+    'jaccard': JaccardDistance{{name_suffix}},
+    'dice': DiceDistance{{name_suffix}},
+    'kulsinski': KulsinskiDistance{{name_suffix}},
+    'rogerstanimoto': RogersTanimotoDistance{{name_suffix}},
+    'russellrao': RussellRaoDistance{{name_suffix}},
+    'sokalmichener': SokalMichenerDistance{{name_suffix}},
+    'sokalsneath': SokalSneathDistance{{name_suffix}},
+    'haversine': HaversineDistance{{name_suffix}},
+    'pyfunc': PyFuncDistance{{name_suffix}},
+}
+
+cdef inline np.ndarray _buffer_to_ndarray{{name_suffix}}(const {{DTYPE_t}}* x, np.npy_intp n):
+    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
+    # In particular, if x is deallocated before the returned array goes
+    # out of scope, this could cause memory errors.  Since there is not
+    # a possibility of this for our use-case, this should be safe.
+    # The type code follows the element type of x (NPY_FLOAT32 for float32).
+    # Note: this Segfaults unless np.import_array() is called above
+    return PyArray_SimpleNewFromData(1, &n, np.NPY_FLOAT{{bitness}}, <void*>x)
+
+
+from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
+cdef {{DTYPE_t}} INF{{name_suffix}} = np.inf  # positive infinity in the templated floating type
+
+
+######################################################################
+# Distance Metric Classes
+cdef class DistanceMetric{{name_suffix}}:
+    """DistanceMetric class
+
+    This class provides a uniform interface to fast distance metric
+    functions.  The various metrics can be accessed via the :meth:`get_metric`
+    class method and the metric string identifier (see below).
+
+    Examples
+    --------
+    >>> from sklearn.metrics import DistanceMetric
+    >>> dist = DistanceMetric.get_metric('euclidean')
+    >>> X = [[0, 1, 2],
+    ...      [3, 4, 5]]
+    >>> dist.pairwise(X)
+    array([[ 0.        ,  5.19615242],
+           [ 5.19615242,  0.        ]])
+
+    Available Metrics
+
+    The following lists the string metric identifiers and the associated
+    distance metric classes:
+
+    **Metrics intended for real-valued vector spaces:**
+
+    ==============  ====================  ========  ===============================
+    identifier      class name            args      distance function
+    --------------  --------------------  --------  -------------------------------
+    "euclidean"     EuclideanDistance     -         ``sqrt(sum((x - y)^2))``
+    "manhattan"     ManhattanDistance     -         ``sum(|x - y|)``
+    "chebyshev"     ChebyshevDistance     -         ``max(|x - y|)``
+    "minkowski"     MinkowskiDistance     p, w      ``sum(w * |x - y|^p)^(1/p)``
+    "wminkowski"    WMinkowskiDistance    p, w      ``sum(|w * (x - y)|^p)^(1/p)``
+    "seuclidean"    SEuclideanDistance    V         ``sqrt(sum((x - y)^2 / V))``
+    "mahalanobis"   MahalanobisDistance   V or VI   ``sqrt((x - y)' V^-1 (x - y))``
+    ==============  ====================  ========  ===============================
+
+    .. deprecated:: 1.1
+        `WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3.
+        Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are
+        applied to the absolute differences already raised to the p power. This is different from
+        `WMinkowskiDistance` where weights are applied to the absolute differences before raising
+        to the p power. The deprecation aims to remain consistent with SciPy 1.8 convention.
+
+    **Metrics intended for two-dimensional vector spaces:**  Note that the haversine
+    distance metric requires data in the form of [latitude, longitude] and both
+    inputs and outputs are in units of radians.
+
+    ============  ==================  ===============================================================
+    identifier    class name          distance function
+    ------------  ------------------  ---------------------------------------------------------------
+    "haversine"   HaversineDistance   ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))``
+    ============  ==================  ===============================================================
+
+
+    **Metrics intended for integer-valued vector spaces:**  Though intended
+    for integer-valued vectors, these are also valid metrics in the case of
+    real-valued vectors.
+
+    =============  ====================  ========================================
+    identifier     class name            distance function
+    -------------  --------------------  ----------------------------------------
+    "hamming"      HammingDistance       ``N_unequal(x, y) / N_tot``
+    "canberra"     CanberraDistance      ``sum(|x - y| / (|x| + |y|))``
+    "braycurtis"   BrayCurtisDistance    ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
+    =============  ====================  ========================================
+
+    **Metrics intended for boolean-valued vector spaces:**  Any nonzero entry
+    is evaluated to "True".  In the listings below, the following
+    abbreviations are used:
+
+     - N  : number of dimensions
+     - NTT : number of dims in which both values are True
+     - NTF : number of dims in which the first value is True, second is False
+     - NFT : number of dims in which the first value is False, second is True
+     - NFF : number of dims in which both values are False
+     - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
+     - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
+
+    =================  =======================  ===============================
+    identifier         class name               distance function
+    -----------------  -----------------------  -------------------------------
+    "jaccard"          JaccardDistance          NNEQ / NNZ
+    "matching"         MatchingDistance         NNEQ / N
+    "dice"             DiceDistance             NNEQ / (NTT + NNZ)
+    "kulsinski"        KulsinskiDistance        (NNEQ + N - NTT) / (NNEQ + N)
+    "rogerstanimoto"   RogersTanimotoDistance   2 * NNEQ / (N + NNEQ)
+    "russellrao"       RussellRaoDistance       (N - NTT) / N
+    "sokalmichener"    SokalMichenerDistance    2 * NNEQ / (N + NNEQ)
+    "sokalsneath"      SokalSneathDistance      NNEQ / (NNEQ + 0.5 * NTT)
+    =================  =======================  ===============================
+
+    **User-defined distance:**
+
+    ===========    ===============    =======
+    identifier     class name         args
+    -----------    ---------------    -------
+    "pyfunc"       PyFuncDistance     func
+    ===========    ===============    =======
+
+    Here ``func`` is a function which takes two one-dimensional numpy
+    arrays, and returns a distance.  Note that in order to be used within
+    the BallTree, the distance must be a true metric:
+    i.e. it must satisfy the following properties
+
+    1) Non-negativity: d(x, y) >= 0
+    2) Identity: d(x, y) = 0 if and only if x == y
+    3) Symmetry: d(x, y) = d(y, x)
+    4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
+
+    Because of the Python object overhead involved in calling the python
+    function, this will be fairly slow, but it will have the same
+    scaling as other distances.
+    """
+    def __cinit__(self):
+        self.size = 1  # matches the length of the placeholder `vec` below
+        self.p = 2
+        self.mat = np.zeros(shape=(1, 1), dtype={{DTYPE}}, order='C')
+        self.vec = np.zeros(shape=(1,), dtype={{DTYPE}}, order='C')
+
+    def __reduce__(self):
+        """Pickling hook: ``(newObj, (class,), state)`` with state from ``__getstate__``."""
+        factory_args = (self.__class__,)
+        state = self.__getstate__()
+        return (newObj, factory_args, state)
+
+    def __getstate__(self):
+        """Return the state to pickle: ``(p, vec, mat)``, plus ``(func, kwargs)`` for pyfunc."""
+        state = (float(self.p), np.asarray(self.vec), np.asarray(self.mat))
+        if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}":
+            # The user-defined metric also stores its callable and its kwargs.
+            state += (self.func, self.kwargs)
+        return state
+
+    def __setstate__(self, state):
+        """Restore the attributes stored by ``__getstate__`` (used when unpickling)."""
+        is_pyfunc = self.__class__.__name__ == "PyFuncDistance{{name_suffix}}"
+        self.p = state[0]
+        # The arrays go through ReadonlyArrayWrapper before being assigned.
+        self.vec = ReadonlyArrayWrapper(state[1])
+        self.mat = ReadonlyArrayWrapper(state[2])
+        if is_pyfunc:
+            self.func = state[3]
+            self.kwargs = state[4]
+        self.size = self.vec.shape[0]
+
+    @classmethod
+    def get_metric(cls, metric, **kwargs):
+        """Get the given distance metric from the string identifier.
+
+        See the docstring of DistanceMetric for a list of available metrics.
+
+        Parameters
+        ----------
+        metric : str, metric class, metric instance or callable
+            The distance metric to use. A callable is wrapped in PyFuncDistance.
+        **kwargs
+            additional arguments will be passed to the requested metric
+
+        Raises
+        ------
+        ValueError
+            If ``metric`` is not a recognized metric identifier.
+        """
+        if isinstance(metric, DistanceMetric{{name_suffix}}):
+            return metric
+
+        # Classes are callable: recognise metric classes before plain callables.
+        if not (isinstance(metric, type) and issubclass(metric, DistanceMetric{{name_suffix}})):
+            if callable(metric):
+                return PyFuncDistance{{name_suffix}}(metric, **kwargs)
+            try:
+                metric = METRIC_MAPPING{{name_suffix}}[metric]
+            except (KeyError, TypeError):  # unknown or unhashable identifier
+                raise ValueError("Unrecognized metric '%s'" % (metric,))
+
+        # In Minkowski special cases, return more efficient methods
+        if metric is MinkowskiDistance{{name_suffix}}:
+            p = kwargs.pop('p', 2)
+            w = kwargs.pop('w', None)
+            if p == 1 and w is None:
+                return ManhattanDistance{{name_suffix}}(**kwargs)
+            elif p == 2 and w is None:
+                return EuclideanDistance{{name_suffix}}(**kwargs)
+            elif np.isinf(p) and w is None:
+                return ChebyshevDistance{{name_suffix}}(**kwargs)
+            return MinkowskiDistance{{name_suffix}}(p, w, **kwargs)
+        return metric(**kwargs)
+
+    def __init__(self):
+        if self.__class__ is DistanceMetric{{name_suffix}}:  # the base class used directly
+            raise NotImplementedError("DistanceMetric{{name_suffix}} is an abstract class")
+
+    def _validate_data(self, X):
+        """Validate the input data.
+
+        The default implementation accepts any input. Subclasses override
+        it when a specific input format is required.
+        """
+        return None
+
+    cdef DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                      ITYPE_t size) nogil except -1:
+        """Compute the distance between vectors x1 and x2
+
+        This should be overridden in a subclass.
+        """
+        return -999  # placeholder value returned by the base implementation
+
+    cdef DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                       ITYPE_t size) nogil except -1:
+        """Compute the rank-preserving surrogate distance between vectors x1 and x2.
+
+        This can optionally be overridden in a subclass.
+
+        The rank-preserving surrogate distance is any measure that yields the same
+        rank as the distance, but is more efficient to compute. For example, the
+        rank-preserving surrogate distance of the Euclidean metric is the
+        squared-euclidean distance.
+        """
+        return self.dist(x1, x2, size)  # default: the distance is its own surrogate
+
+    cdef int pdist(self, const {{DTYPE_t}}[:, ::1] X, {{DTYPE_t}}[:, ::1] D) except -1:
+        """Fill D with the distances between every pair of rows of X."""
+        cdef ITYPE_t row, col, n_rows = X.shape[0], n_features = X.shape[1]
+        for row in range(n_rows):
+            for col in range(row, n_rows):
+                D[row, col] = self.dist(&X[row, 0], &X[col, 0], n_features)
+                D[col, row] = D[row, col]  # mirror the upper triangle
+        return 0
+
+    cdef int cdist(self, const {{DTYPE_t}}[:, ::1] X, const {{DTYPE_t}}[:, ::1] Y,
+                   {{DTYPE_t}}[:, ::1] D) except -1:
+        """Fill D[i, j] with the distance between row i of X and row j of Y."""
+        cdef ITYPE_t row_x, row_y, n_features = X.shape[1]
+        if n_features != Y.shape[1]:
+            raise ValueError('X and Y must have the same second dimension')
+        for row_x in range(X.shape[0]):
+            for row_y in range(Y.shape[0]):
+                D[row_x, row_y] = self.dist(&X[row_x, 0], &Y[row_y, 0], n_features)
+        return 0
+
+    cdef DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        """Convert the rank-preserving surrogate distance to the distance"""
+        return rdist  # identity in this base implementation
+
+    cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        """Convert the distance to the rank-preserving surrogate distance"""
+        return dist  # identity in this base implementation
+
+    def rdist_to_dist(self, rdist):
+        """Convert the rank-preserving surrogate distance to the distance.
+
+        The surrogate distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, the
+        rank-preserving surrogate distance of the Euclidean metric is the
+        squared-euclidean distance.
+
+        Parameters
+        ----------
+        rdist : double
+            Surrogate distance.
+
+        Returns
+        -------
+        double
+            True distance. This base implementation returns ``rdist`` unchanged.
+        """
+        return rdist
+
+    def dist_to_rdist(self, dist):
+        """Convert the true distance to the rank-preserving surrogate distance.
+
+        The surrogate distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, the
+        rank-preserving surrogate distance of the Euclidean metric is the
+        squared-euclidean distance.
+
+        Parameters
+        ----------
+        dist : double
+            True distance.
+
+        Returns
+        -------
+        double
+            Surrogate distance.
+        """
+        return dist
+
+    def pairwise(self, X, Y=None):
+        """Compute the pairwise distances between X and Y
+
+        This is a convenience routine for the sake of testing.  For many
+        metrics, the utilities in scipy.spatial.distance.cdist and
+        scipy.spatial.distance.pdist will be faster.
+
+        Parameters
+        ----------
+        X : array-like
+            Array of shape (Nx, D), representing Nx points in D dimensions.
+        Y : array-like (optional)
+            Array of shape (Ny, D), representing Ny points in D dimensions.
+            If not specified, then Y=X.
+
+        Returns
+        -------
+        dist : ndarray
+            The shape (Nx, Ny) array of pairwise distances between points in
+            X and Y.
+        """
+        cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Xarr
+        cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Yarr
+        cdef np.ndarray[{{DTYPE_t}}, ndim=2, mode='c'] Darr
+
+        Xarr = np.asarray(X, dtype={{DTYPE}}, order='C')
+        self._validate_data(Xarr)
+        if Y is None:
+            Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
+                         dtype={{DTYPE}}, order='C')
+            self.pdist(Xarr, Darr)
+        else:
+            Yarr = np.asarray(Y, dtype={{DTYPE}}, order='C')
+            self._validate_data(Yarr)
+            Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
+                         dtype={{DTYPE}}, order='C')
+            self.cdist(Xarr, Yarr, Darr)
+        return Darr
+
+
+#------------------------------------------------------------
+# Euclidean Distance
+#  d = sqrt(sum((x_i - y_i)^2))
+cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Euclidean Distance metric
+
+    .. math::
+       D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }
+    """
+    def __init__(self):
+        self.p = 2
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return euclidean_dist{{name_suffix}}(x1, x2, size)
+
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) nogil except -1:
+        return euclidean_rdist{{name_suffix}}(x1, x2, size)
+
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return sqrt(rdist)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return dist * dist
+
+    def rdist_to_dist(self, rdist):
+        return np.sqrt(rdist)
+
+    def dist_to_rdist(self, dist):
+        return dist ** 2
+
+
+#------------------------------------------------------------
+# SEuclidean Distance
+#  d = sqrt(sum((x_i - y_i)^2 / v_i))
+cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Standardized Euclidean Distance metric
+
+    .. math::
+       D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
+    """
+    def __init__(self, V):
+        self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype={{DTYPE}}))
+        self.size = self.vec.shape[0]
+        self.p = 2
+
+    def _validate_data(self, X):
+        if X.shape[1] != self.size:
+            raise ValueError('SEuclidean dist: size of V does not match')
+
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t tmp, d=0
+        cdef np.intp_t j
+        for j in range(size):
+            tmp = <DTYPE_t> (x1[j] - x2[j])
+            d += <DTYPE_t> (tmp * tmp / self.vec[j])
+        return d
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return sqrt(self.rdist(x1, x2, size))
+
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return sqrt(rdist)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return dist * dist
+
+    def rdist_to_dist(self, rdist):
+        return np.sqrt(rdist)
+
+    def dist_to_rdist(self, dist):
+        return dist ** 2
+
+
+#------------------------------------------------------------
+# Manhattan Distance
+#  d = sum(abs(x_i - y_i))
+cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Manhattan/City-block Distance metric
+
+    .. math::
+       D(x, y) = \sum_i |x_i - y_i|
+    """
+    def __init__(self):
+        self.p = 1
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            d += <DTYPE_t> fabs(x1[j] - x2[j])
+        return d
+
+
+#------------------------------------------------------------
+# Chebyshev Distance
+#  d = max_i(abs(x_i - y_i))
+cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    """Chebyshev/Infinity Distance
+
+    .. math::
+       D(x, y) = max_i (|x_i - y_i|)
+
+    Examples
+    --------
+    >>> from sklearn.metrics import DistanceMetric
+    >>> dist = DistanceMetric.get_metric('chebyshev')
+    >>> X = [[0, 1, 2],
+    ...      [3, 4, 5]]
+    >>> Y = [[-1, 0, 1],
+    ...      [3, 4, 5]]
+    >>> dist.pairwise(X, Y)
+    array([[1., 3.],
+           [4., 0.]])
+    """
+    def __init__(self):
+        self.p = INF{{name_suffix}}
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            d = <DTYPE_t> fmax(d, fabs(x1[j] - x2[j]))
+        return d
+
+
+#------------------------------------------------------------
+# Minkowski Distance
+cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Minkowski Distance
+
+    .. math::
+        D(x, y) = {||u-v||}_p
+
+    when w is None.
+
+    Here is the more general expanded expression for the weighted case:
+
+    .. math::
+        D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p)
+
+    Parameters
+    ----------
+    p : int
+        The order of the p-norm of the difference (see above).
+    w : (N,) array-like (optional)
+        The weight vector.
+
+    Minkowski Distance requires p >= 1 and finite. For p = infinity,
+    use ChebyshevDistance.
+    Note that for p=1, ManhattanDistance is more efficient, and for
+    p=2, EuclideanDistance is more efficient.
+    """
+    def __init__(self, p, w=None):
+        if p < 1:
+            raise ValueError("p must be greater than 1")
+        elif np.isinf(p):
+            raise ValueError("MinkowskiDistance requires finite p. "
+                             "For p=inf, use ChebyshevDistance.")
+
+        self.p = p
+        if w is not None:
+            w_array = check_array(
+                w, ensure_2d=False, dtype={{DTYPE}}, input_name="w"
+            )
+            if (w_array < 0).any():
+                raise ValueError("w cannot contain negative weights")
+            self.vec = ReadonlyArrayWrapper(w_array)
+            self.size = self.vec.shape[0]
+        else:
+            self.vec = ReadonlyArrayWrapper(np.asarray([], dtype={{DTYPE}}))
+            self.size = 0
+
+    def _validate_data(self, X):
+        if self.size > 0 and X.shape[1] != self.size:
+            raise ValueError("MinkowskiDistance: the size of w must match "
+                             f"the number of features ({X.shape[1]}). "
+                             f"Currently len(w)={self.size}.")
+
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d=0
+        cdef np.intp_t j
+        cdef bint has_w = self.size > 0
+        if has_w:
+            for j in range(size):
+                d += <DTYPE_t> (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p))
+        else:
+            for j in range(size):
+                d += <DTYPE_t> (pow(fabs(x1[j] - x2[j]), self.p))
+        return d
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
+
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return <DTYPE_t> pow(rdist, 1. / self.p)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return <DTYPE_t> pow(dist, self.p)
+
+    def rdist_to_dist(self, rdist):
+        return rdist ** (1. / self.p)
+
+    def dist_to_rdist(self, dist):
+        return dist ** self.p
+
+
+#------------------------------------------------------------
+# TODO: Remove in 1.3 - WMinkowskiDistance class
+# W-Minkowski Distance
+cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Weighted Minkowski Distance
+
+    .. math::
+       D(x, y) = [\sum_i |w_i * (x_i - y_i)|^p] ^ (1/p)
+
+    Weighted Minkowski Distance requires p >= 1 and finite.
+
+    Parameters
+    ----------
+    p : int
+        The order of the norm of the difference :math:`{||u-v||}_p`.
+    w : (N,) array-like
+        The weight vector.
+
+    """
+    def __init__(self, p, w):
+        from warnings import warn
+        warn("WMinkowskiDistance is deprecated in version 1.1 and will be "
+            "removed in version 1.3. Use MinkowskiDistance instead. Note "
+            "that in MinkowskiDistance, the weights are applied to the "
+            "absolute differences raised to the p power. This is different "
+            "from WMinkowskiDistance where weights are applied to the "
+            "absolute differences before raising to the p power. "
+            "The deprecation aims to remain consistent with SciPy 1.8 "
+            "convention.", FutureWarning)
+
+        if p < 1:
+            raise ValueError("p must be greater than 1")
+        elif np.isinf(p):
+            raise ValueError("WMinkowskiDistance requires finite p. "
+                             "For p=inf, use ChebyshevDistance.")
+        self.p = p
+        self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype={{DTYPE}}))
+        self.size = self.vec.shape[0]
+
+    def _validate_data(self, X):
+        if X.shape[1] != self.size:
+            raise ValueError('WMinkowskiDistance dist: '
+                             'size of w does not match')
+
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
+        return d
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
+
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return <DTYPE_t> pow(rdist, 1. / self.p)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return <DTYPE_t> pow(dist, self.p)
+
+    def rdist_to_dist(self, rdist):
+        return rdist ** (1. / self.p)
+
+    def dist_to_rdist(self, dist):
+        return dist ** self.p
+
+
+#------------------------------------------------------------
+# Mahalanobis Distance
+#  d = sqrt( (x - y)^T V^-1 (x - y) )
+cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Mahalanobis Distance
+
+    .. math::
+       D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) }
+
+    Parameters
+    ----------
+    V : array-like
+        Symmetric positive-definite covariance matrix.
+        The inverse of this matrix will be explicitly computed.
+    VI : array-like
+        optionally specify the inverse directly.  If VI is passed,
+        then V is not referenced.
+    """
+    def __init__(self, V=None, VI=None):
+        if VI is None:
+            if V is None:
+                raise ValueError("Must provide either V or VI "
+                                 "for Mahalanobis distance")
+            VI = np.linalg.inv(V)
+        if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
+            raise ValueError("V/VI must be square")
+
+        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype={{DTYPE}}, order='C'))
+
+        self.size = self.mat.shape[0]
+
+        # we need vec as a work buffer
+        self.vec = np.zeros(self.size, dtype={{DTYPE}})
+
+    def _validate_data(self, X):
+        if X.shape[1] != self.size:
+            raise ValueError('Mahalanobis dist: size of V does not match')
+
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t tmp, d = 0
+        cdef np.intp_t i, j
+
+        # compute (x1 - x2).T * VI * (x1 - x2)
+        for i in range(size):
+            self.vec[i] = x1[i] - x2[i]
+
+        for i in range(size):
+            tmp = 0
+            for j in range(size):
+                tmp += self.mat[i, j] * self.vec[j]
+            d += tmp * self.vec[i]
+        return d
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return sqrt(self.rdist(x1, x2, size))
+
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return sqrt(rdist)
+
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        return dist * dist
+
+    def rdist_to_dist(self, rdist):
+        return np.sqrt(rdist)
+
+    def dist_to_rdist(self, dist):
+        return dist ** 2
+
+
+#------------------------------------------------------------
+# Hamming Distance
+#  d = N_unequal(x, y) / N_tot
+cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Hamming Distance
+
+    Hamming distance is meant for discrete-valued vectors, though it is
+    a valid metric for real-valued vectors.
+
+    .. math::
+       D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int n_unequal = 0
+        cdef np.intp_t j
+        for j in range(size):
+            if x1[j] != x2[j]:
+                n_unequal += 1
+        return float(n_unequal) / size
+
+
+#------------------------------------------------------------
+# Canberra Distance
+#  D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ]
+cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Canberra Distance
+
+    Canberra distance is meant for discrete-valued vectors, though it is
+    a valid metric for real-valued vectors.
+
+    .. math::
+       D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t denom, d = 0
+        cdef np.intp_t j
+        for j in range(size):
+            denom = <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
+            if denom > 0:
+                d += <DTYPE_t>(fabs(x1[j] - x2[j])) / denom
+        return d
+
+
+#------------------------------------------------------------
+# Bray-Curtis Distance
+#  D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)]
+cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Bray-Curtis Distance
+
+    Bray-Curtis distance is meant for discrete-valued vectors, though it is
+    a valid metric for real-valued vectors.
+
+    .. math::
+       D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef DTYPE_t num = 0, denom = 0
+        cdef np.intp_t j
+        for j in range(size):
+            num += <DTYPE_t> fabs(x1[j] - x2[j])
+            denom += <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
+        if denom > 0:
+            return num / denom
+        else:
+            return 0.0
+
+
+#------------------------------------------------------------
+# Jaccard Distance (boolean)
+#  D(x, y) = N_unequal(x, y) / N_nonzero(x, y)
+cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Jaccard Distance
+
+    Jaccard Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, n_eq = 0, nnz = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            nnz += (tf1 or tf2)
+            n_eq += (tf1 and tf2)
+        # Based on https://github.com/scipy/scipy/pull/7373
+        # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
+        # was changed to return 0, instead of nan.
+        if nnz == 0:
+            return 0
+        return (nnz - n_eq) * 1.0 / nnz
+
+
+#------------------------------------------------------------
+# Matching Distance (boolean)
+#  D(x, y) = n_neq / n
+cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Matching Distance
+
+    Matching Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, n_neq = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            n_neq += (tf1 != tf2)
+        return n_neq * 1. / size
+
+
+#------------------------------------------------------------
+# Dice Distance (boolean)
+#  D(x, y) = n_neq / (2 * ntt + n_neq)
+cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Dice Distance
+
+    Dice Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, n_neq = 0, ntt = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            ntt += (tf1 and tf2)
+            n_neq += (tf1 != tf2)
+        return n_neq / (2.0 * ntt + n_neq)
+
+
+#------------------------------------------------------------
+# Kulsinski Distance (boolean)
+#  D(x, y) = (ntf + nft - ntt + n) / (n_neq + n)
+cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Kulsinski Distance
+
+    Kulsinski Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, ntt = 0, n_neq = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            n_neq += (tf1 != tf2)
+            ntt += (tf1 and tf2)
+        return (n_neq - ntt + size) * 1.0 / (n_neq + size)
+
+
+#------------------------------------------------------------
+# Rogers-Tanimoto Distance (boolean)
+#  D(x, y) = 2 * n_neq / (n + n_neq)
+cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Rogers-Tanimoto Distance
+
+    Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, n_neq = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            n_neq += (tf1 != tf2)
+        return (2.0 * n_neq) / (size + n_neq)
+
+
+#------------------------------------------------------------
+# Russell-Rao Distance (boolean)
+#  D(x, y) = (n - ntt) / n
+cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Russell-Rao Distance
+
+    Russell-Rao Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, ntt = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            ntt += (tf1 and tf2)
+        return (size - ntt) * 1. / size
+
+
+#------------------------------------------------------------
+# Sokal-Michener Distance (boolean)
+#  D(x, y) = 2 * n_neq / (n + n_neq)
+cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Sokal-Michener Distance
+
+    Sokal-Michener Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, n_neq = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            n_neq += (tf1 != tf2)
+        return (2.0 * n_neq) / (size + n_neq)
+
+
+#------------------------------------------------------------
+# Sokal-Sneath Distance (boolean)
+#  D(x, y) = n_neq / (0.5 * n_tt + n_neq)
+cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    r"""Sokal-Sneath Distance
+
+    Sokal-Sneath Distance is a dissimilarity measure for boolean-valued
+    vectors. All nonzero entries will be treated as True, zero entries will
+    be treated as False.
+
+    """
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        cdef int tf1, tf2, ntt = 0, n_neq = 0
+        cdef np.intp_t j
+        for j in range(size):
+            tf1 = x1[j] != 0
+            tf2 = x2[j] != 0
+            n_neq += (tf1 != tf2)
+            ntt += (tf1 and tf2)
+        return n_neq / (0.5 * ntt + n_neq)
+
+
+#------------------------------------------------------------
+# Haversine Distance (2 dimensional)
+#  D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2)
+#                          + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]}
+cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    """Haversine (Spherical) Distance
+
+    The Haversine distance is the angular distance between two points on
+    the surface of a sphere.  The first coordinate of each point is assumed
+    to be the latitude, the second is the longitude, given in radians.
+    The dimension of the points must be 2.
+
+    """
+
+    def _validate_data(self, X):
+        if X.shape[1] != 2:
+            raise ValueError("Haversine distance only valid "
+                             "in 2 dimensions")
+
+    cdef inline DTYPE_t rdist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) nogil except -1:
+        cdef DTYPE_t sin_0 = <DTYPE_t> sin(0.5 * (x1[0] - x2[0]))
+        cdef DTYPE_t sin_1 = <DTYPE_t> sin(0.5 * (x1[1] - x2[1]))
+        return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
+
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return 2 * asin(sqrt(self.rdist(x1, x2, size)))
+
+    cdef inline DTYPE_t _rdist_to_dist(self, {{DTYPE_t}} rdist) nogil except -1:
+        return 2 * asin(sqrt(rdist))
+
+    cdef inline DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1:
+        cdef DTYPE_t tmp = <DTYPE_t> sin(0.5 * dist)
+        return tmp * tmp
+
+    def rdist_to_dist(self, rdist):
+        return 2 * np.arcsin(np.sqrt(rdist))
+
+    def dist_to_rdist(self, dist):
+        tmp = np.sin(0.5 * dist)
+        return tmp * tmp
+
+#------------------------------------------------------------
+# User-defined distance
+#
+cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
+    """PyFunc Distance
+
+    A user-defined distance
+
+    Parameters
+    ----------
+    func : function
+        func should take two numpy arrays as input, and return a distance.
+    """
+    def __init__(self, func, **kwargs):
+        self.func = func
+        self.kwargs = kwargs
+
+    # in cython < 0.26, GIL was required to be acquired during definition of
+    # the function and inside the body of the function. This behaviour is not
+    # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
+    # only way to be back compatible is to inherit `dist` from the base class
+    # without GIL and call an inline `_dist` which acquires the GIL.
+    cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                             ITYPE_t size) nogil except -1:
+        return self._dist(x1, x2, size)
+
+    cdef inline DTYPE_t _dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
+                              ITYPE_t size) except -1 with gil:
+        cdef np.ndarray x1arr
+        cdef np.ndarray x2arr
+        x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size)
+        x2arr = _buffer_to_ndarray{{name_suffix}}(x2, size)
+        d = self.func(x1arr, x2arr, **self.kwargs)
+        try:
+            # Cython generates code here that results in a TypeError
+            # if d is the wrong type.
+            return d
+        except TypeError:
+            raise TypeError("Custom distance function must accept two "
+                            "vectors and return a float.")
+
+
+######################################################################
+# Datasets Pair Classes
+cdef class DatasetsPair{{name_suffix}}:
+    """Abstract class which wraps a pair of datasets (X, Y).
+
+    This class allows computing distances between a single pair of rows
+    of X and Y at a time given the pair of their indices (i, j). This class is
+    specialized for each metric thanks to the :func:`get_for` factory classmethod.
+
+    The handling of parallelization over chunks to compute the distances
+    and aggregation for several rows at a time is done in dedicated
+    subclasses of PairwiseDistancesReduction that in-turn rely on
+    subclasses of DatasetsPair for each pair of rows in the data. The goal
+    is to make it possible to decouple the generic parallelization and
+    aggregation logic from metric-specific computation as much as
+    possible.
+
+    X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
+    in subclasses.
+
+    This class avoids the overhead of dispatching distance computations
+    to :class:`sklearn.metrics.DistanceMetric` based on the physical
+    representation of the vectors (sparse vs. dense). It makes use of
+    cython.final to remove the overhead of dispatching method calls.
+
+    Parameters
+    ----------
+    distance_metric: DistanceMetric
+        The distance metric responsible for computing distances
+        between two vectors of (X, Y).
+    """
+
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        str metric="euclidean",
+        dict metric_kwargs=None,
+    ) -> DatasetsPair{{name_suffix}}:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+            If provided as a sparse matrix, it must be in CSR format.
+
+        metric : str, default='euclidean'
+            The distance metric to compute between rows of X and Y.
+            The default metric is a fast implementation of the Euclidean
+            metric. For a list of available metrics, see the documentation
+            of :class:`~sklearn.metrics.DistanceMetric`.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        Returns
+        -------
+        datasets_pair: DatasetsPair
+            The suited DatasetsPair implementation.
+        """
+        cdef:
+            DistanceMetric{{name_suffix}} distance_metric = DistanceMetric{{name_suffix}}.get_metric(
+                metric,
+                **(metric_kwargs or {})
+            )
+
+        if not(X.dtype == Y.dtype == np.float{{bitness}}):
+            raise ValueError(
+                f"Datasets must be of np.float{{bitness}} type. "
+                f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+            )
+
+        # Metric-specific checks that do not replace nor duplicate `check_array`.
+        distance_metric._validate_data(X)
+        distance_metric._validate_data(Y)
+
+        # TODO: dispatch to other dataset pairs for sparse support once available:
+        if issparse(X) or issparse(Y):
+            raise ValueError("Only dense datasets are supported for X and Y.")
+
+        return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+
+    def __init__(self, DistanceMetric{{name_suffix}} distance_metric):
+        self.distance_metric = distance_metric
+
+    cdef ITYPE_t n_samples_X(self) nogil:
+        """Number of samples in X."""
+        # This is an abstract method.
+        # This _must_ always be overridden in subclasses.
+        # TODO: add "with gil: raise" here when supporting Cython 3.0
+        return -999
+
+    cdef ITYPE_t n_samples_Y(self) nogil:
+        """Number of samples in Y."""
+        # This is an abstract method.
+        # This _must_ always be overridden in subclasses.
+        # TODO: add "with gil: raise" here when supporting Cython 3.0
+        return -999
+
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        return self.dist(i, j)
+
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        # This is an abstract method.
+        # This _must_ always be overridden in subclasses.
+        # TODO: add "with gil: raise" here when supporting Cython 3.0
+        return -1
+
+@final
+cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+    """Compute distances between row vectors of two arrays.
+
+    Parameters
+    ----------
+    X: ndarray of shape (n_samples_X, n_features)
+        Rows represent vectors. Must be C-contiguous.
+
+    Y: ndarray of shape (n_samples_Y, n_features)
+        Rows represent vectors. Must be C-contiguous.
+
+    distance_metric: DistanceMetric
+        The distance metric responsible for computing distances
+        between two row vectors of (X, Y).
+    """
+
+    def __init__(self, X, Y, DistanceMetric{{name_suffix}} distance_metric):
+        super().__init__(distance_metric)
+        # Arrays have already been checked
+        self.X = X
+        self.Y = Y
+        self.d = X.shape[1]
+
+    @final
+    cdef ITYPE_t n_samples_X(self) nogil:
+        return self.X.shape[0]
+
+    @final
+    cdef ITYPE_t n_samples_Y(self) nogil:
+        return self.Y.shape[0]
+
+    @final
+    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        return self.distance_metric.rdist(&self.X[i, 0],
+                                          &self.Y[j, 0],
+                                          self.d)
+
+    @final
+    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
+        return self.distance_metric.dist(&self.X[i, 0],
+                                         &self.Y[j, 0],
+                                         self.d)
+
+{{endfor}}
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index 736ba6d7d4424..fc912068cb6c4 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -3,6 +3,8 @@
 
 from numpy.distutils.misc_util import Configuration
 
+from sklearn._build_utils import gen_from_templates
+
 
 def configuration(parent_package="", top_path=None):
     config = Configuration("metrics", parent_package, top_path)
@@ -19,6 +21,13 @@ def configuration(parent_package="", top_path=None):
         "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries
     )
 
+    templates = [
+        "sklearn/metrics/_dist_metrics.pyx.tp",
+        "sklearn/metrics/_dist_metrics.pxd.tp",
+    ]
+
+    gen_from_templates(templates)
+
     config.add_extension(
         "_dist_metrics",
         sources=["_dist_metrics.pyx"],
diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
index 6c841d1d44f8c..1a24afe2b2b30 100644
--- a/sklearn/metrics/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -10,7 +10,13 @@
 import scipy.sparse as sp
 from scipy.spatial.distance import cdist
 from sklearn.metrics import DistanceMetric
-from sklearn.metrics._dist_metrics import BOOL_METRICS
+
+from sklearn.metrics._dist_metrics import (
+    BOOL_METRICS,
+    # Private (unexposed) DistanceMetric implementation for float32 data
+    DistanceMetric32,
+)
+
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import create_memmap_backed_data
 from sklearn.utils.fixes import sp_version, parse_version
@@ -24,16 +30,18 @@ def dist_func(x1, x2, p):
 d = 4
 n1 = 20
 n2 = 25
-X1 = rng.random_sample((n1, d)).astype("float64", copy=False)
-X2 = rng.random_sample((n2, d)).astype("float64", copy=False)
+X64 = rng.random_sample((n1, d)).astype("float64", copy=False)
+Y64 = rng.random_sample((n2, d)).astype("float64", copy=False)
+X32 = X64.astype("float32")
+Y32 = Y64.astype("float32")
 
-[X1_mmap, X2_mmap] = create_memmap_backed_data([X1, X2])
+[X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64])
 
 # make boolean arrays: ones and zeros
-X1_bool = X1.round(0)
-X2_bool = X2.round(0)
+X_bool = X64.round(0)
+Y_bool = Y64.round(0)
 
-[X1_bool_mmap, X2_bool_mmap] = create_memmap_backed_data([X1_bool, X2_bool])
+[X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool])
 
 
 V = rng.random_sample((d, d))
@@ -65,27 +73,14 @@ def dist_func(x1, x2, p):
     )
 
 
-def check_cdist(metric, kwargs, X1, X2):
-    if metric == "wminkowski":
-        # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0
-        WarningToExpect = None
-        if sp_version >= parse_version("1.6.0"):
-            WarningToExpect = DeprecationWarning
-        with pytest.warns(WarningToExpect):
-            D_scipy_cdist = cdist(X1, X2, metric, **kwargs)
-    else:
-        D_scipy_cdist = cdist(X1, X2, metric, **kwargs)
-
-    dm = DistanceMetric.get_metric(metric, **kwargs)
-    D_sklearn = dm.pairwise(X1, X2)
-    assert_array_almost_equal(D_sklearn, D_scipy_cdist)
-
-
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
-@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
-def test_cdist(metric_param_grid, X1, X2):
+@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
+def test_cdist(metric_param_grid, X, Y):
+    DistanceMetricInterface = (
+        DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32
+    )
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -96,29 +91,41 @@ def test_cdist(metric_param_grid, X1, X2):
             pytest.xfail(
                 "scipy#13861: cdist with 'mahalanobis' fails on joblib memmap data"
             )
-        check_cdist(metric, kwargs, X1, X2)
+
+        if metric == "wminkowski":
+            # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0
+            WarningToExpect = None
+            if sp_version >= parse_version("1.6.0"):
+                WarningToExpect = DeprecationWarning
+            with pytest.warns(WarningToExpect):
+                D_scipy_cdist = cdist(X, Y, metric, **kwargs)
+        else:
+            D_scipy_cdist = cdist(X, Y, metric, **kwargs)
+
+        dm = DistanceMetricInterface.get_metric(metric, **kwargs)
+        D_sklearn = dm.pairwise(X, Y)
+        assert_array_almost_equal(D_sklearn, D_scipy_cdist)
 
 
 @pytest.mark.parametrize("metric", BOOL_METRICS)
 @pytest.mark.parametrize(
-    "X1_bool, X2_bool", [(X1_bool, X2_bool), (X1_bool_mmap, X2_bool_mmap)]
+    "X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)]
 )
-def test_cdist_bool_metric(metric, X1_bool, X2_bool):
-    D_true = cdist(X1_bool, X2_bool, metric)
-    check_cdist_bool(metric, D_true)
-
-
-def check_cdist_bool(metric, D_true):
+def test_cdist_bool_metric(metric, X_bool, Y_bool):
+    D_true = cdist(X_bool, Y_bool, metric)
     dm = DistanceMetric.get_metric(metric)
-    D12 = dm.pairwise(X1_bool, X2_bool)
+    D12 = dm.pairwise(X_bool, Y_bool)
     assert_array_almost_equal(D12, D_true)
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
-@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
-def test_pdist(metric_param_grid, X1, X2):
+@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)])
+def test_pdist(metric_param_grid, X, Y):
+    DistanceMetricInterface = (
+        DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32
+    )
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -135,29 +142,21 @@ def test_pdist(metric_param_grid, X1, X2):
             if sp_version >= parse_version("1.6.0"):
                 ExceptionToAssert = DeprecationWarning
             with pytest.warns(ExceptionToAssert):
-                D_true = cdist(X1, X1, metric, **kwargs)
+                D_true = cdist(X, X, metric, **kwargs)
         else:
-            D_true = cdist(X1, X1, metric, **kwargs)
+            D_true = cdist(X, X, metric, **kwargs)
 
-        check_pdist(metric, kwargs, D_true)
+        dm = DistanceMetricInterface.get_metric(metric, **kwargs)
+        D12 = dm.pairwise(X)
+        assert_array_almost_equal(D12, D_true)
 
 
 @pytest.mark.parametrize("metric", BOOL_METRICS)
-@pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap])
-def test_pdist_bool_metrics(metric, X1_bool):
-    D_true = cdist(X1_bool, X1_bool, metric)
-    check_pdist_bool(metric, D_true)
-
-
-def check_pdist(metric, kwargs, D_true):
-    dm = DistanceMetric.get_metric(metric, **kwargs)
-    D12 = dm.pairwise(X1)
-    assert_array_almost_equal(D12, D_true)
-
-
-def check_pdist_bool(metric, D_true):
+@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
+def test_pdist_bool_metrics(metric, X_bool):
+    D_true = cdist(X_bool, X_bool, metric)
     dm = DistanceMetric.get_metric(metric)
-    D12 = dm.pairwise(X1_bool)
+    D12 = dm.pairwise(X_bool)
     # Based on https://github.com/scipy/scipy/pull/7373
     # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
     # was changed to return 0, instead of nan.
@@ -170,7 +169,11 @@ def check_pdist_bool(metric, D_true):
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("writable_kwargs", [True, False])
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
-def test_pickle(writable_kwargs, metric_param_grid):
+@pytest.mark.parametrize("X", [X64, X32])
+def test_pickle(writable_kwargs, metric_param_grid, X):
+    DistanceMetricInterface = (
+        DistanceMetric if X.dtype == np.float64 else DistanceMetric32
+    )
     metric, param_grid = metric_param_grid
     keys = param_grid.keys()
     for vals in itertools.product(*param_grid.values()):
@@ -180,26 +183,22 @@ def test_pickle(writable_kwargs, metric_param_grid):
                 if isinstance(val, np.ndarray):
                     val.setflags(write=writable_kwargs)
         kwargs = dict(zip(keys, vals))
-        check_pickle(metric, kwargs)
+        dm = DistanceMetricInterface.get_metric(metric, **kwargs)
+        D1 = dm.pairwise(X)
+        dm2 = pickle.loads(pickle.dumps(dm))
+        D2 = dm2.pairwise(X)
+        assert_array_almost_equal(D1, D2)
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric", BOOL_METRICS)
-@pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap])
-def test_pickle_bool_metrics(metric, X1_bool):
+@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap])
+def test_pickle_bool_metrics(metric, X_bool):
     dm = DistanceMetric.get_metric(metric)
-    D1 = dm.pairwise(X1_bool)
-    dm2 = pickle.loads(pickle.dumps(dm))
-    D2 = dm2.pairwise(X1_bool)
-    assert_array_almost_equal(D1, D2)
-
-
-def check_pickle(metric, kwargs):
-    dm = DistanceMetric.get_metric(metric, **kwargs)
-    D1 = dm.pairwise(X1)
+    D1 = dm.pairwise(X_bool)
     dm2 = pickle.loads(pickle.dumps(dm))
-    D2 = dm2.pairwise(X1)
+    D2 = dm2.pairwise(X_bool)
     assert_array_almost_equal(D1, D2)
 
 
@@ -305,11 +304,11 @@ def test_minkowski_metric_validate_weights_size():
     dm = DistanceMetric.get_metric("minkowski", p=3, w=w2)
     msg = (
         "MinkowskiDistance: the size of w must match "
-        f"the number of features \\({X1.shape[1]}\\). "
+        f"the number of features \\({X64.shape[1]}\\). "
         f"Currently len\\(w\\)={w2.shape[0]}."
     )
     with pytest.raises(ValueError, match=msg):
-        dm.pairwise(X1, X2)
+        dm.pairwise(X64, Y64)
 
 
 # TODO: Remove in 1.3 when wminkowski is removed
@@ -328,6 +327,6 @@ def test_wminkowski_minkowski_equivalence(p):
     # Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of 'minkowski'
     dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p))
     dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w)
-    D_wmks = dm_wmks.pairwise(X1, X2)
-    D_mks = dm_mks.pairwise(X1, X2)
+    D_wmks = dm_wmks.pairwise(X64, Y64)
+    D_mks = dm_mks.pairwise(X64, Y64)
     assert_array_almost_equal(D_wmks, D_mks)

From dd50629af49b643d69b7b0de00068b11bd7e1ec8 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 17 Mar 2022 10:44:13 +0100
Subject: [PATCH 17/26] MAINT Remove generated pyx file

---
 sklearn/metrics/_dist_metrics.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index a64ea88f3b4a6..04324ec3ced98 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -15,7 +15,7 @@ cdef extern from "arrayobject.h":
                                      int typenum, void* data)
 
 from scipy.sparse import csr_matrix, issparse
-from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DTYPECODE
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
 from ..utils._typedefs import DTYPE, ITYPE
 from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
 from ..utils import check_array

From 545326fe44782fda7c9f69a2e6c7824c7e86c0d0 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 17 Mar 2022 11:03:26 +0100
Subject: [PATCH 18/26] MAINT Do not generate 32bit version of DatasetsPair for
 now

---
 sklearn/metrics/_dist_metrics.pxd.tp |   13 +-
 sklearn/metrics/_dist_metrics.pyx    | 2517 --------------------------
 sklearn/metrics/_dist_metrics.pyx.tp |   23 +-
 3 files changed, 17 insertions(+), 2536 deletions(-)
 delete mode 100644 sklearn/metrics/_dist_metrics.pyx

diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index ff3e01c66e564..8c9c960bd882e 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -94,11 +94,12 @@ cdef class DistanceMetric{{name_suffix}}:
 
     cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1
 
+{{endfor}}
 
 ######################################################################
 # DatasetsPair base class
-cdef class DatasetsPair{{name_suffix}}:
-    cdef DistanceMetric{{name_suffix}} distance_metric
+cdef class DatasetsPair:
+    cdef DistanceMetric distance_metric
 
     cdef ITYPE_t n_samples_X(self) nogil
 
@@ -109,10 +110,8 @@ cdef class DatasetsPair{{name_suffix}}:
     cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
 
 
-cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+cdef class DenseDenseDatasetsPair(DatasetsPair):
     cdef:
-        const {{DTYPE_t}}[:, ::1] X
-        const {{DTYPE_t}}[:, ::1] Y
+        const DTYPE_t[:, ::1] X
+        const DTYPE_t[:, ::1] Y
         ITYPE_t d
-
-{{endfor}}
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
deleted file mode 100644
index 04324ec3ced98..0000000000000
--- a/sklearn/metrics/_dist_metrics.pyx
+++ /dev/null
@@ -1,2517 +0,0 @@
-# By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
-# written for the scikit-learn project
-# License: BSD
-
-import numpy as np
-cimport numpy as np
-from cython cimport final
-
-np.import_array()  # required in order to use C-API
-
-
-# First, define a function to get an ndarray from a memory buffer
-cdef extern from "arrayobject.h":
-    object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims,
-                                     int typenum, void* data)
-
-from scipy.sparse import csr_matrix, issparse
-from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
-from ..utils._typedefs import DTYPE, ITYPE
-from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
-from ..utils import check_array
-
-cdef inline double fmax(double a, double b) nogil:
-    return max(a, b)
-
-
-######################################################################
-# newObj function
-#  this is a helper function for pickling
-def newObj(obj):
-    return obj.__new__(obj)
-
-
-BOOL_METRICS = [
-    "hamming",
-    "matching",
-    "jaccard",
-    "dice",
-    "kulsinski",
-    "rogerstanimoto",
-    "russellrao",
-    "sokalmichener",
-    "sokalsneath",
-]
-
-def get_valid_metric_ids(L):
-    """Given an iterable of metric class names or class identifiers,
-    return a list of metric IDs which map to those classes.
-
-    Example:
-    >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance'])
-    >>> sorted(L)
-    ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan']
-    """
-    return [key for (key, val) in METRIC_MAPPING.items()
-            if (val.__name__ in L) or (val in L)]
-
-
-######################################################################
-# metric mappings
-#  These map from metric id strings to class names
-METRIC_MAPPING = {
-    'euclidean': EuclideanDistance,
-    'l2': EuclideanDistance,
-    'minkowski': MinkowskiDistance,
-    'p': MinkowskiDistance,
-    'manhattan': ManhattanDistance,
-    'cityblock': ManhattanDistance,
-    'l1': ManhattanDistance,
-    'chebyshev': ChebyshevDistance,
-    'infinity': ChebyshevDistance,
-    'seuclidean': SEuclideanDistance,
-    'mahalanobis': MahalanobisDistance,
-    'wminkowski': WMinkowskiDistance,
-    'hamming': HammingDistance,
-    'canberra': CanberraDistance,
-    'braycurtis': BrayCurtisDistance,
-    'matching': MatchingDistance,
-    'jaccard': JaccardDistance,
-    'dice': DiceDistance,
-    'kulsinski': KulsinskiDistance,
-    'rogerstanimoto': RogersTanimotoDistance,
-    'russellrao': RussellRaoDistance,
-    'sokalmichener': SokalMichenerDistance,
-    'sokalsneath': SokalSneathDistance,
-    'haversine': HaversineDistance,
-    'pyfunc': PyFuncDistance,
-}
-
-cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
-    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
-    # In particular, if x is deallocated before the returned array goes
-    # out of scope, this could cause memory errors.  Since there is not
-    # a possibility of this for our use-case, this should be safe.
-
-    # Note: this Segfaults unless np.import_array() is called above
-    return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)
-
-
-from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
-cdef DTYPE_t INF = np.inf
-
-
-######################################################################
-# Distance Metric Classes
-cdef class DistanceMetric:
-    """DistanceMetric class
-
-    This class provides a uniform interface to fast distance metric
-    functions.  The various metrics can be accessed via the :meth:`get_metric`
-    class method and the metric string identifier (see below).
-
-    Examples
-    --------
-    >>> from sklearn.metrics import DistanceMetric
-    >>> dist = DistanceMetric.get_metric('euclidean')
-    >>> X = [[0, 1, 2],
-             [3, 4, 5]]
-    >>> dist.pairwise(X)
-    array([[ 0.        ,  5.19615242],
-           [ 5.19615242,  0.        ]])
-
-    Available Metrics
-
-    The following lists the string metric identifiers and the associated
-    distance metric classes:
-
-    **Metrics intended for real-valued vector spaces:**
-
-    ==============  ====================  ========  ===============================
-    identifier      class name            args      distance function
-    --------------  --------------------  --------  -------------------------------
-    "euclidean"     EuclideanDistance     -         ``sqrt(sum((x - y)^2))``
-    "manhattan"     ManhattanDistance     -         ``sum(|x - y|)``
-    "chebyshev"     ChebyshevDistance     -         ``max(|x - y|)``
-    "minkowski"     MinkowskiDistance     p, w      ``sum(w * |x - y|^p)^(1/p)``
-    "wminkowski"    WMinkowskiDistance    p, w      ``sum(|w * (x - y)|^p)^(1/p)``
-    "seuclidean"    SEuclideanDistance    V         ``sqrt(sum((x - y)^2 / V))``
-    "mahalanobis"   MahalanobisDistance   V or VI   ``sqrt((x - y)' V^-1 (x - y))``
-    ==============  ====================  ========  ===============================
-
-    .. deprecated:: 1.1
-        `WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3.
-        Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are
-        applied to the absolute differences already raised to the p power. This is different from
-        `WMinkowskiDistance` where weights are applied to the absolute differences before raising
-        to the p power. The deprecation aims to remain consistent with SciPy 1.8 convention.
-
-    **Metrics intended for two-dimensional vector spaces:**  Note that the haversine
-    distance metric requires data in the form of [latitude, longitude] and both
-    inputs and outputs are in units of radians.
-
-    ============  ==================  ===============================================================
-    identifier    class name          distance function
-    ------------  ------------------  ---------------------------------------------------------------
-    "haversine"   HaversineDistance   ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))``
-    ============  ==================  ===============================================================
-
-
-    **Metrics intended for integer-valued vector spaces:**  Though intended
-    for integer-valued vectors, these are also valid metrics in the case of
-    real-valued vectors.
-
-    =============  ====================  ========================================
-    identifier     class name            distance function
-    -------------  --------------------  ----------------------------------------
-    "hamming"      HammingDistance       ``N_unequal(x, y) / N_tot``
-    "canberra"     CanberraDistance      ``sum(|x - y| / (|x| + |y|))``
-    "braycurtis"   BrayCurtisDistance    ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
-    =============  ====================  ========================================
-
-    **Metrics intended for boolean-valued vector spaces:**  Any nonzero entry
-    is evaluated to "True".  In the listings below, the following
-    abbreviations are used:
-
-     - N  : number of dimensions
-     - NTT : number of dims in which both values are True
-     - NTF : number of dims in which the first value is True, second is False
-     - NFT : number of dims in which the first value is False, second is True
-     - NFF : number of dims in which both values are False
-     - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
-     - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
-
-    =================  =======================  ===============================
-    identifier         class name               distance function
-    -----------------  -----------------------  -------------------------------
-    "jaccard"          JaccardDistance          NNEQ / NNZ
-    "matching"         MatchingDistance         NNEQ / N
-    "dice"             DiceDistance             NNEQ / (NTT + NNZ)
-    "kulsinski"        KulsinskiDistance        (NNEQ + N - NTT) / (NNEQ + N)
-    "rogerstanimoto"   RogersTanimotoDistance   2 * NNEQ / (N + NNEQ)
-    "russellrao"       RussellRaoDistance       (N - NTT) / N
-    "sokalmichener"    SokalMichenerDistance    2 * NNEQ / (N + NNEQ)
-    "sokalsneath"      SokalSneathDistance      NNEQ / (NNEQ + 0.5 * NTT)
-    =================  =======================  ===============================
-
-    **User-defined distance:**
-
-    ===========    ===============    =======
-    identifier     class name         args
-    -----------    ---------------    -------
-    "pyfunc"       PyFuncDistance     func
-    ===========    ===============    =======
-
-    Here ``func`` is a function which takes two one-dimensional numpy
-    arrays, and returns a distance.  Note that in order to be used within
-    the BallTree, the distance must be a true metric:
-    i.e. it must satisfy the following properties
-
-    1) Non-negativity: d(x, y) >= 0
-    2) Identity: d(x, y) = 0 if and only if x == y
-    3) Symmetry: d(x, y) = d(y, x)
-    4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
-
-    Because of the Python object overhead involved in calling the python
-    function, this will be fairly slow, but it will have the same
-    scaling as other distances.
-    """
-    def __cinit__(self):
-        self.p = 2
-        self.vec = np.zeros(1, dtype=DTYPE, order='C')
-        self.mat = np.zeros((1, 1), dtype=DTYPE, order='C')
-        self.size = 1
-
-    def __reduce__(self):
-        """
-        reduce method used for pickling
-        """
-        return (newObj, (self.__class__,), self.__getstate__())
-
-    def __getstate__(self):
-        """
-        get state for pickling
-        """
-        if self.__class__.__name__ == "PyFuncDistance":
-            return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs)
-        return (float(self.p), np.asarray(self.vec), np.asarray(self.mat))
-
-    def __setstate__(self, state):
-        """
-        set state for pickling
-        """
-        self.p = state[0]
-        self.vec = ReadonlyArrayWrapper(state[1])
-        self.mat = ReadonlyArrayWrapper(state[2])
-        if self.__class__.__name__ == "PyFuncDistance":
-            self.func = state[3]
-            self.kwargs = state[4]
-        self.size = self.vec.shape[0]
-
-    @classmethod
-    def get_metric(cls, metric, **kwargs):
-        """Get the given distance metric from the string identifier.
-
-        See the docstring of DistanceMetric for a list of available metrics.
-
-        Parameters
-        ----------
-        metric : str or class name
-            The distance metric to use
-        **kwargs
-            additional arguments will be passed to the requested metric
-        """
-        if isinstance(metric, DistanceMetric):
-            return metric
-
-        if callable(metric):
-            return PyFuncDistance(metric, **kwargs)
-
-        # Map the metric string ID to the metric class
-        if isinstance(metric, type) and issubclass(metric, DistanceMetric):
-            pass
-        else:
-            try:
-                metric = METRIC_MAPPING[metric]
-            except:
-                raise ValueError("Unrecognized metric '%s'" % metric)
-
-        # In Minkowski special cases, return more efficient methods
-        if metric is MinkowskiDistance:
-            p = kwargs.pop('p', 2)
-            w = kwargs.pop('w', None)
-            if p == 1 and w is None:
-                return ManhattanDistance(**kwargs)
-            elif p == 2 and w is None:
-                return EuclideanDistance(**kwargs)
-            elif np.isinf(p) and w is None:
-                return ChebyshevDistance(**kwargs)
-            else:
-                return MinkowskiDistance(p, w, **kwargs)
-        else:
-            return metric(**kwargs)
-
-    def __init__(self):
-        if self.__class__ is DistanceMetric:
-            raise NotImplementedError("DistanceMetric is an abstract class")
-
-    def _validate_data(self, X):
-        """Validate the input data.
-
-        This should be overridden in a base class if a specific input format
-        is required.
-        """
-        return
-
-    cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                      ITYPE_t size) nogil except -1:
-        """Compute the distance between vectors x1 and x2
-
-        This should be overridden in a base class.
-        """
-        return -999
-
-    cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                       ITYPE_t size) nogil except -1:
-        """Compute the rank-preserving surrogate distance between vectors x1 and x2.
-
-        This can optionally be overridden in a base class.
-
-        The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For example, the
-        rank-preserving surrogate distance of the Euclidean metric is the
-        squared-euclidean distance.
-        """
-        return self.dist(x1, x2, size)
-
-    cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1:
-        """compute the pairwise distances between points in X"""
-        cdef ITYPE_t i1, i2
-        for i1 in range(X.shape[0]):
-            for i2 in range(i1, X.shape[0]):
-                D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1])
-                D[i2, i1] = D[i1, i2]
-        return 0
-
-    cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
-                   DTYPE_t[:, ::1] D) except -1:
-        """compute the cross-pairwise distances between arrays X and Y"""
-        cdef ITYPE_t i1, i2
-        if X.shape[1] != Y.shape[1]:
-            raise ValueError('X and Y must have the same second dimension')
-        for i1 in range(X.shape[0]):
-            for i2 in range(Y.shape[0]):
-                D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
-        return 0
-
-    cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        """Convert the rank-preserving surrogate distance to the distance"""
-        return rdist
-
-    cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        """Convert the distance to the rank-preserving surrogate distance"""
-        return dist
-
-    def rdist_to_dist(self, rdist):
-        """Convert the rank-preserving surrogate distance to the distance.
-
-        The surrogate distance is any measure that yields the same rank as the
-        distance, but is more efficient to compute. For example, the
-        rank-preserving surrogate distance of the Euclidean metric is the
-        squared-euclidean distance.
-
-        Parameters
-        ----------
-        rdist : double
-            Surrogate distance.
-
-        Returns
-        -------
-        double
-            True distance.
-        """
-        return rdist
-
-    def dist_to_rdist(self, dist):
-        """Convert the true distance to the rank-preserving surrogate distance.
-
-        The surrogate distance is any measure that yields the same rank as the
-        distance, but is more efficient to compute. For example, the
-        rank-preserving surrogate distance of the Euclidean metric is the
-        squared-euclidean distance.
-
-        Parameters
-        ----------
-        dist : double
-            True distance.
-
-        Returns
-        -------
-        double
-            Surrogate distance.
-        """
-        return dist
-
-    def pairwise(self, X, Y=None):
-        """Compute the pairwise distances between X and Y
-
-        This is a convenience routine for the sake of testing.  For many
-        metrics, the utilities in scipy.spatial.distance.cdist and
-        scipy.spatial.distance.pdist will be faster.
-
-        Parameters
-        ----------
-        X : array-like
-            Array of shape (Nx, D), representing Nx points in D dimensions.
-        Y : array-like (optional)
-            Array of shape (Ny, D), representing Ny points in D dimensions.
-            If not specified, then Y=X.
-
-        Returns
-        -------
-        dist : ndarray
-            The shape (Nx, Ny) array of pairwise distances between points in
-            X and Y.
-        """
-        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr
-        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr
-        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr
-
-        Xarr = np.asarray(X, dtype=DTYPE, order='C')
-        self._validate_data(Xarr)
-        if Y is None:
-            Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
-                         dtype=DTYPE, order='C')
-            self.pdist(Xarr, Darr)
-        else:
-            Yarr = np.asarray(Y, dtype=DTYPE, order='C')
-            self._validate_data(Yarr)
-            Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
-                         dtype=DTYPE, order='C')
-            self.cdist(Xarr, Yarr, Darr)
-        return Darr
-
-
-#------------------------------------------------------------
-# Euclidean Distance
-#  d = sqrt(sum(x_i^2 - y_i^2))
-cdef class EuclideanDistance(DistanceMetric):
-    r"""Euclidean Distance metric
-
-    .. math::
-       D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }
-    """
-    def __init__(self):
-        self.p = 2
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return euclidean_dist(x1, x2, size)
-
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) nogil except -1:
-        return euclidean_rdist(x1, x2, size)
-
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return sqrt(rdist)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return dist * dist
-
-    def rdist_to_dist(self, rdist):
-        return np.sqrt(rdist)
-
-    def dist_to_rdist(self, dist):
-        return dist ** 2
-
-
-#------------------------------------------------------------
-# SEuclidean Distance
-#  d = sqrt(sum((x_i - y_i2)^2 / v_i))
-cdef class SEuclideanDistance(DistanceMetric):
-    r"""Standardized Euclidean Distance metric
-
-    .. math::
-       D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
-    """
-    def __init__(self, V):
-        self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=DTYPE))
-        self.size = self.vec.shape[0]
-        self.p = 2
-
-    def _validate_data(self, X):
-        if X.shape[1] != self.size:
-            raise ValueError('SEuclidean dist: size of V does not match')
-
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t tmp, d=0
-        cdef np.intp_t j
-        for j in range(size):
-            tmp = <DTYPE_t> (x1[j] - x2[j])
-            d += <DTYPE_t> (tmp * tmp / self.vec[j])
-        return d
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return sqrt(self.rdist(x1, x2, size))
-
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return sqrt(rdist)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return dist * dist
-
-    def rdist_to_dist(self, rdist):
-        return np.sqrt(rdist)
-
-    def dist_to_rdist(self, dist):
-        return dist ** 2
-
-
-#------------------------------------------------------------
-# Manhattan Distance
-#  d = sum(abs(x_i - y_i))
-cdef class ManhattanDistance(DistanceMetric):
-    r"""Manhattan/City-block Distance metric
-
-    .. math::
-       D(x, y) = \sum_i |x_i - y_i|
-    """
-    def __init__(self):
-        self.p = 1
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            d += <DTYPE_t> fabs(x1[j] - x2[j])
-        return d
-
-
-#------------------------------------------------------------
-# Chebyshev Distance
-#  d = max_i(abs(x_i - y_i))
-cdef class ChebyshevDistance(DistanceMetric):
-    """Chebyshev/Infinity Distance
-
-    .. math::
-       D(x, y) = max_i (|x_i - y_i|)
-
-    Examples
-    --------
-    >>> from sklearn.metrics.dist_metrics import DistanceMetric
-    >>> dist = DistanceMetric.get_metric('chebyshev')
-    >>> X = [[0, 1, 2],
-    ...      [3, 4, 5]]
-    >>> Y = [[-1, 0, 1],
-    ...      [3, 4, 5]]
-    >>> dist.pairwise(X, Y)
-    array([[1.732..., 5.196...],
-           [6.928..., 0....   ]])
-    """
-    def __init__(self):
-        self.p = INF
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            d = <DTYPE_t> fmax(d, fabs(x1[j] - x2[j]))
-        return d
-
-
-#------------------------------------------------------------
-# Minkowski Distance
-cdef class MinkowskiDistance(DistanceMetric):
-    r"""Minkowski Distance
-
-    .. math::
-        D(x, y) = {||u-v||}_p
-
-    when w is None.
-
-    Here is the more general expanded expression for the weighted case:
-
-    .. math::
-        D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p)
-
-    Parameters
-    ----------
-    p : int
-        The order of the p-norm of the difference (see above).
-    w : (N,) array-like (optional)
-        The weight vector.
-
-    Minkowski Distance requires p >= 1 and finite. For p = infinity,
-    use ChebyshevDistance.
-    Note that for p=1, ManhattanDistance is more efficient, and for
-    p=2, EuclideanDistance is more efficient.
-    """
-    def __init__(self, p, w=None):
-        if p < 1:
-            raise ValueError("p must be greater than 1")
-        elif np.isinf(p):
-            raise ValueError("MinkowskiDistance requires finite p. "
-                             "For p=inf, use ChebyshevDistance.")
-
-        self.p = p
-        if w is not None:
-            w_array = check_array(
-                w, ensure_2d=False, dtype=DTYPE, input_name="w"
-            )
-            if (w_array < 0).any():
-                raise ValueError("w cannot contain negative weights")
-            self.vec = ReadonlyArrayWrapper(w_array)
-            self.size = self.vec.shape[0]
-        else:
-            self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE))
-            self.size = 0
-
-    def _validate_data(self, X):
-        if self.size > 0 and X.shape[1] != self.size:
-            raise ValueError("MinkowskiDistance: the size of w must match "
-                             f"the number of features ({X.shape[1]}). "
-                             f"Currently len(w)={self.size}.")
-
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d=0
-        cdef np.intp_t j
-        cdef bint has_w = self.size > 0
-        if has_w:
-            for j in range(size):
-                d += <DTYPE_t> (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p))
-        else:
-            for j in range(size):
-                d += <DTYPE_t> (pow(fabs(x1[j] - x2[j]), self.p))
-        return d
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
-
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return <DTYPE_t> pow(rdist, 1. / self.p)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return <DTYPE_t> pow(dist, self.p)
-
-    def rdist_to_dist(self, rdist):
-        return rdist ** (1. / self.p)
-
-    def dist_to_rdist(self, dist):
-        return dist ** self.p
-
-
-#------------------------------------------------------------
-# TODO: Remove in 1.3 - WMinkowskiDistance class
-# W-Minkowski Distance
-cdef class WMinkowskiDistance(DistanceMetric):
-    r"""Weighted Minkowski Distance
-
-    .. math::
-       D(x, y) = [\sum_i |w_i * (x_i - y_i)|^p] ^ (1/p)
-
-    Weighted Minkowski Distance requires p >= 1 and finite.
-
-    Parameters
-    ----------
-    p : int
-        The order of the norm of the difference :math:`{||u-v||}_p`.
-    w : (N,) array-like
-        The weight vector.
-
-    """
-    def __init__(self, p, w):
-        from warnings import warn
-        warn("WMinkowskiDistance is deprecated in version 1.1 and will be "
-            "removed in version 1.3. Use MinkowskiDistance instead. Note "
-            "that in MinkowskiDistance, the weights are applied to the "
-            "absolute differences raised to the p power. This is different "
-            "from WMinkowskiDistance where weights are applied to the "
-            "absolute differences before raising to the p power. "
-            "The deprecation aims to remain consistent with SciPy 1.8 "
-            "convention.", FutureWarning)
-
-        if p < 1:
-            raise ValueError("p must be greater than 1")
-        elif np.isinf(p):
-            raise ValueError("WMinkowskiDistance requires finite p. "
-                             "For p=inf, use ChebyshevDistance.")
-        self.p = p
-        self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE))
-        self.size = self.vec.shape[0]
-
-    def _validate_data(self, X):
-        if X.shape[1] != self.size:
-            raise ValueError('WMinkowskiDistance dist: '
-                             'size of w does not match')
-
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
-        return d
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
-
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return <DTYPE_t> pow(rdist, 1. / self.p)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return <DTYPE_t> pow(dist, self.p)
-
-    def rdist_to_dist(self, rdist):
-        return rdist ** (1. / self.p)
-
-    def dist_to_rdist(self, dist):
-        return dist ** self.p
-
-
-#------------------------------------------------------------
-# Mahalanobis Distance
-#  d = sqrt( (x - y)^T V^-1 (x - y) )
-cdef class MahalanobisDistance(DistanceMetric):
-    """Mahalanobis Distance
-
-    .. math::
-       D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) }
-
-    Parameters
-    ----------
-    V : array-like
-        Symmetric positive-definite covariance matrix.
-        The inverse of this matrix will be explicitly computed.
-    VI : array-like
-        optionally specify the inverse directly.  If VI is passed,
-        then V is not referenced.
-    """
-    def __init__(self, V=None, VI=None):
-        if VI is None:
-            if V is None:
-                raise ValueError("Must provide either V or VI "
-                                 "for Mahalanobis distance")
-            VI = np.linalg.inv(V)
-        if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
-            raise ValueError("V/VI must be square")
-
-        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=DTYPE, order='C'))
-
-        self.size = self.mat.shape[0]
-
-        # we need vec as a work buffer
-        self.vec = np.zeros(self.size, dtype=DTYPE)
-
-    def _validate_data(self, X):
-        if X.shape[1] != self.size:
-            raise ValueError('Mahalanobis dist: size of V does not match')
-
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t tmp, d = 0
-        cdef np.intp_t i, j
-
-        # compute (x1 - x2).T * VI * (x1 - x2)
-        for i in range(size):
-            self.vec[i] = x1[i] - x2[i]
-
-        for i in range(size):
-            tmp = 0
-            for j in range(size):
-                tmp += self.mat[i, j] * self.vec[j]
-            d += tmp * self.vec[i]
-        return d
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return sqrt(self.rdist(x1, x2, size))
-
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return sqrt(rdist)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        return dist * dist
-
-    def rdist_to_dist(self, rdist):
-        return np.sqrt(rdist)
-
-    def dist_to_rdist(self, dist):
-        return dist ** 2
-
-
-#------------------------------------------------------------
-# Hamming Distance
-#  d = N_unequal(x, y) / N_tot
-cdef class HammingDistance(DistanceMetric):
-    r"""Hamming Distance
-
-    Hamming distance is meant for discrete-valued vectors, though it is
-    a valid metric for real-valued vectors.
-
-    .. math::
-       D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int n_unequal = 0
-        cdef np.intp_t j
-        for j in range(size):
-            if x1[j] != x2[j]:
-                n_unequal += 1
-        return float(n_unequal) / size
-
-
-#------------------------------------------------------------
-# Canberra Distance
-#  D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ]
-cdef class CanberraDistance(DistanceMetric):
-    r"""Canberra Distance
-
-    Canberra distance is meant for discrete-valued vectors, though it is
-    a valid metric for real-valued vectors.
-
-    .. math::
-       D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t denom, d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            denom = <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
-            if denom > 0:
-                d += <DTYPE_t>(fabs(x1[j] - x2[j])) / denom
-        return d
-
-
-#------------------------------------------------------------
-# Bray-Curtis Distance
-#  D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)]
-cdef class BrayCurtisDistance(DistanceMetric):
-    r"""Bray-Curtis Distance
-
-    Bray-Curtis distance is meant for discrete-valued vectors, though it is
-    a valid metric for real-valued vectors.
-
-    .. math::
-       D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t num = 0, denom = 0
-        cdef np.intp_t j
-        for j in range(size):
-            num += <DTYPE_t> fabs(x1[j] - x2[j])
-            denom += <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
-        if denom > 0:
-            return num / denom
-        else:
-            return 0.0
-
-
-#------------------------------------------------------------
-# Jaccard Distance (boolean)
-#  D(x, y) = N_unequal(x, y) / N_nonzero(x, y)
-cdef class JaccardDistance(DistanceMetric):
-    r"""Jaccard Distance
-
-    Jaccard Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_eq = 0, nnz = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            nnz += (tf1 or tf2)
-            n_eq += (tf1 and tf2)
-        # Based on https://github.com/scipy/scipy/pull/7373
-        # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
-        # was changed to return 0, instead of nan.
-        if nnz == 0:
-            return 0
-        return (nnz - n_eq) * 1.0 / nnz
-
-
-#------------------------------------------------------------
-# Matching Distance (boolean)
-#  D(x, y) = n_neq / n
-cdef class MatchingDistance(DistanceMetric):
-    r"""Matching Distance
-
-    Matching Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-        return n_neq * 1. / size
-
-
-#------------------------------------------------------------
-# Dice Distance (boolean)
-#  D(x, y) = n_neq / (2 * ntt + n_neq)
-cdef class DiceDistance(DistanceMetric):
-    r"""Dice Distance
-
-    Dice Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0, ntt = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            ntt += (tf1 and tf2)
-            n_neq += (tf1 != tf2)
-        return n_neq / (2.0 * ntt + n_neq)
-
-
-#------------------------------------------------------------
-# Kulsinski Distance (boolean)
-#  D(x, y) = (ntf + nft - ntt + n) / (n_neq + n)
-cdef class KulsinskiDistance(DistanceMetric):
-    r"""Kulsinski Distance
-
-    Kulsinski Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, ntt = 0, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-            ntt += (tf1 and tf2)
-        return (n_neq - ntt + size) * 1.0 / (n_neq + size)
-
-
-#------------------------------------------------------------
-# Rogers-Tanimoto Distance (boolean)
-#  D(x, y) = 2 * n_neq / (n + n_neq)
-cdef class RogersTanimotoDistance(DistanceMetric):
-    r"""Rogers-Tanimoto Distance
-
-    Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-        return (2.0 * n_neq) / (size + n_neq)
-
-
-#------------------------------------------------------------
-# Russell-Rao Distance (boolean)
-#  D(x, y) = (n - ntt) / n
-cdef class RussellRaoDistance(DistanceMetric):
-    r"""Russell-Rao Distance
-
-    Russell-Rao Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, ntt = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            ntt += (tf1 and tf2)
-        return (size - ntt) * 1. / size
-
-
-#------------------------------------------------------------
-# Sokal-Michener Distance (boolean)
-#  D(x, y) = 2 * n_neq / (n + n_neq)
-cdef class SokalMichenerDistance(DistanceMetric):
-    r"""Sokal-Michener Distance
-
-    Sokal-Michener Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-        return (2.0 * n_neq) / (size + n_neq)
-
-
-#------------------------------------------------------------
-# Sokal-Sneath Distance (boolean)
-#  D(x, y) = n_neq / (0.5 * n_tt + n_neq)
-cdef class SokalSneathDistance(DistanceMetric):
-    r"""Sokal-Sneath Distance
-
-    Sokal-Sneath Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, ntt = 0, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-            ntt += (tf1 and tf2)
-        return n_neq / (0.5 * ntt + n_neq)
-
-
-#------------------------------------------------------------
-# Haversine Distance (2 dimensional)
-#  D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2)
-#                          + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]}
-cdef class HaversineDistance(DistanceMetric):
-    """Haversine (Spherical) Distance
-
-    The Haversine distance is the angular distance between two points on
-    the surface of a sphere.  The first distance of each point is assumed
-    to be the latitude, the second is the longitude, given in radians.
-    The dimension of the points must be 2:
-
-    """
-
-    def _validate_data(self, X):
-        if X.shape[1] != 2:
-            raise ValueError("Haversine distance only valid "
-                             "in 2 dimensions")
-
-    cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t sin_0 = <DTYPE_t> sin(0.5 * (x1[0] - x2[0]))
-        cdef DTYPE_t sin_1 = <DTYPE_t> sin(0.5 * (x1[1] - x2[1]))
-        return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
-
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return 2 * asin(sqrt(self.rdist(x1, x2, size)))
-
-    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        return 2 * asin(sqrt(rdist))
-
-    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        cdef DTYPE_t tmp = <DTYPE_t> sin(0.5 * dist)
-        return tmp * tmp
-
-    def rdist_to_dist(self, rdist):
-        return 2 * np.arcsin(np.sqrt(rdist))
-
-    def dist_to_rdist(self, dist):
-        tmp = np.sin(0.5 * dist)
-        return tmp * tmp
-
-#------------------------------------------------------------
-# User-defined distance
-#
-cdef class PyFuncDistance(DistanceMetric):
-    """PyFunc Distance
-
-    A user-defined distance
-
-    Parameters
-    ----------
-    func : function
-        func should take two numpy arrays as input, and return a distance.
-    """
-    def __init__(self, func, **kwargs):
-        self.func = func
-        self.kwargs = kwargs
-
-    # in cython < 0.26, GIL was required to be acquired during definition of
-    # the function and inside the body of the function. This behaviour is not
-    # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
-    # only way to be back compatible is to inherit `dist` from the base class
-    # without GIL and called an inline `_dist` which acquire GIL.
-    cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return self._dist(x1, x2, size)
-
-    cdef inline DTYPE_t _dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
-                              ITYPE_t size) except -1 with gil:
-        cdef np.ndarray x1arr
-        cdef np.ndarray x2arr
-        x1arr = _buffer_to_ndarray(x1, size)
-        x2arr = _buffer_to_ndarray(x2, size)
-        d = self.func(x1arr, x2arr, **self.kwargs)
-        try:
-            # Cython generates code here that results in a TypeError
-            # if d is the wrong type.
-            return d
-        except TypeError:
-            raise TypeError("Custom distance function must accept two "
-                            "vectors and return a float.")
-
-
-######################################################################
-# Datasets Pair Classes
-cdef class DatasetsPair:
-    """Abstract class which wraps a pair of datasets (X, Y).
-
-    This class allows computing distances between a single pair of rows of
-    of X and Y at a time given the pair of their indices (i, j). This class is
-    specialized for each metric thanks to the :func:`get_for` factory classmethod.
-
-    The handling of parallelization over chunks to compute the distances
-    and aggregation for several rows at a time is done in dedicated
-    subclasses of PairwiseDistancesReduction that in-turn rely on
-    subclasses of DatasetsPair for each pair of rows in the data. The goal
-    is to make it possible to decouple the generic parallelization and
-    aggregation logic from metric-specific computation as much as
-    possible.
-
-    X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
-    in subclasses.
-
-    This class avoids the overhead of dispatching distance computations
-    to :class:`sklearn.metrics.DistanceMetric` based on the physical
-    representation of the vectors (sparse vs. dense). It makes use of
-    cython.final to remove the overhead of dispatching method calls.
-
-    Parameters
-    ----------
-    distance_metric: DistanceMetric
-        The distance metric responsible for computing distances
-        between two vectors of (X, Y).
-    """
-
-    @classmethod
-    def get_for(
-        cls,
-        X,
-        Y,
-        str metric="euclidean",
-        dict metric_kwargs=None,
-    ) -> DatasetsPair:
-        """Return the DatasetsPair implementation for the given arguments.
-
-        Parameters
-        ----------
-        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        metric : str, default='euclidean'
-            The distance metric to compute between rows of X and Y.
-            The default metric is a fast implementation of the Euclidean
-            metric. For a list of available metrics, see the documentation
-            of :class:`~sklearn.metrics.DistanceMetric`.
-
-        metric_kwargs : dict, default=None
-            Keyword arguments to pass to specified metric function.
-
-        Returns
-        -------
-        datasets_pair: DatasetsPair
-            The suited DatasetsPair implementation.
-        """
-        cdef:
-            DistanceMetric distance_metric = DistanceMetric.get_metric(
-                metric,
-                **(metric_kwargs or {})
-            )
-
-        if not(X.dtype == Y.dtype == np.float64):
-            raise ValueError(
-                f"Datasets must be of np.float64 type. "
-                f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
-            )
-
-        # Metric-specific checks that do not replace nor duplicate `check_array`.
-        distance_metric._validate_data(X)
-        distance_metric._validate_data(Y)
-
-        # TODO: dispatch to other dataset pairs for sparse support once available:
-        if issparse(X) or issparse(Y):
-            raise ValueError("Only dense datasets are supported for X and Y.")
-
-        return DenseDenseDatasetsPair(X, Y, distance_metric)
-
-    def __init__(self, DistanceMetric distance_metric):
-        self.distance_metric = distance_metric
-
-    cdef ITYPE_t n_samples_X(self) nogil:
-        """Number of samples in X."""
-        # This is a abstract method.
-        # This _must_ always be overwritten in subclasses.
-        # TODO: add "with gil: raise" here when supporting Cython 3.0
-        return -999
-
-    cdef ITYPE_t n_samples_Y(self) nogil:
-        """Number of samples in Y."""
-        # This is a abstract method.
-        # This _must_ always be overwritten in subclasses.
-        # TODO: add "with gil: raise" here when supporting Cython 3.0
-        return -999
-
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        return self.dist(i, j)
-
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        # This is a abstract method.
-        # This _must_ always be overwritten in subclasses.
-        # TODO: add "with gil: raise" here when supporting Cython 3.0
-        return -1
-
-@final
-cdef class DenseDenseDatasetsPair(DatasetsPair):
-    """Compute distances between row vectors of two arrays.
-
-    Parameters
-    ----------
-    X: ndarray of shape (n_samples_X, n_features)
-        Rows represent vectors. Must be C-contiguous.
-
-    Y: ndarray of shape (n_samples_Y, n_features)
-        Rows represent vectors. Must be C-contiguous.
-
-    distance_metric: DistanceMetric
-        The distance metric responsible for computing distances
-        between two row vectors of (X, Y).
-    """
-
-    def __init__(self, X, Y, DistanceMetric distance_metric):
-        super().__init__(distance_metric)
-        # Arrays have already been checked
-        self.X = X
-        self.Y = Y
-        self.d = X.shape[1]
-
-    @final
-    cdef ITYPE_t n_samples_X(self) nogil:
-        return self.X.shape[0]
-
-    @final
-    cdef ITYPE_t n_samples_Y(self) nogil:
-        return self.Y.shape[0]
-
-    @final
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        return self.distance_metric.rdist(&self.X[i, 0],
-                                          &self.Y[j, 0],
-                                          self.d)
-
-    @final
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        return self.distance_metric.dist(&self.X[i, 0],
-                                         &self.Y[j, 0],
-                                         self.d)
-
-######################################################################
-# metric mappings
-#  These map from metric id strings to class names
-METRIC_MAPPING32 = {
-    'euclidean': EuclideanDistance32,
-    'l2': EuclideanDistance32,
-    'minkowski': MinkowskiDistance32,
-    'p': MinkowskiDistance32,
-    'manhattan': ManhattanDistance32,
-    'cityblock': ManhattanDistance32,
-    'l1': ManhattanDistance32,
-    'chebyshev': ChebyshevDistance32,
-    'infinity': ChebyshevDistance32,
-    'seuclidean': SEuclideanDistance32,
-    'mahalanobis': MahalanobisDistance32,
-    'wminkowski': WMinkowskiDistance32,
-    'hamming': HammingDistance32,
-    'canberra': CanberraDistance32,
-    'braycurtis': BrayCurtisDistance32,
-    'matching': MatchingDistance32,
-    'jaccard': JaccardDistance32,
-    'dice': DiceDistance32,
-    'kulsinski': KulsinskiDistance32,
-    'rogerstanimoto': RogersTanimotoDistance32,
-    'russellrao': RussellRaoDistance32,
-    'sokalmichener': SokalMichenerDistance32,
-    'sokalsneath': SokalSneathDistance32,
-    'haversine': HaversineDistance32,
-    'pyfunc': PyFuncDistance32,
-}
-
-cdef inline np.ndarray _buffer_to_ndarray32(const np.float32_t* x, np.npy_intp n):
-    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
-    # In particular, if x is deallocated before the returned array goes
-    # out of scope, this could cause memory errors.  Since there is not
-    # a possibility of this for our use-case, this should be safe.
-
-    # Note: this Segfaults unless np.import_array() is called above
-    return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)
-
-
-from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
-cdef np.float32_t INF32 = np.inf
-
-
-######################################################################
-# Distance Metric Classes
-cdef class DistanceMetric32:
-    """DistanceMetric class
-
-    This class provides a uniform interface to fast distance metric
-    functions.  The various metrics can be accessed via the :meth:`get_metric`
-    class method and the metric string identifier (see below).
-
-    Examples
-    --------
-    >>> from sklearn.metrics import DistanceMetric
-    >>> dist = DistanceMetric.get_metric('euclidean')
-    >>> X = [[0, 1, 2],
-             [3, 4, 5]]
-    >>> dist.pairwise(X)
-    array([[ 0.        ,  5.19615242],
-           [ 5.19615242,  0.        ]])
-
-    Available Metrics
-
-    The following lists the string metric identifiers and the associated
-    distance metric classes:
-
-    **Metrics intended for real-valued vector spaces:**
-
-    ==============  ====================  ========  ===============================
-    identifier      class name            args      distance function
-    --------------  --------------------  --------  -------------------------------
-    "euclidean"     EuclideanDistance     -         ``sqrt(sum((x - y)^2))``
-    "manhattan"     ManhattanDistance     -         ``sum(|x - y|)``
-    "chebyshev"     ChebyshevDistance     -         ``max(|x - y|)``
-    "minkowski"     MinkowskiDistance     p, w      ``sum(w * |x - y|^p)^(1/p)``
-    "wminkowski"    WMinkowskiDistance    p, w      ``sum(|w * (x - y)|^p)^(1/p)``
-    "seuclidean"    SEuclideanDistance    V         ``sqrt(sum((x - y)^2 / V))``
-    "mahalanobis"   MahalanobisDistance   V or VI   ``sqrt((x - y)' V^-1 (x - y))``
-    ==============  ====================  ========  ===============================
-
-    .. deprecated:: 1.1
-        `WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3.
-        Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are
-        applied to the absolute differences already raised to the p power. This is different from
-        `WMinkowskiDistance` where weights are applied to the absolute differences before raising
-        to the p power. The deprecation aims to remain consistent with SciPy 1.8 convention.
-
-    **Metrics intended for two-dimensional vector spaces:**  Note that the haversine
-    distance metric requires data in the form of [latitude, longitude] and both
-    inputs and outputs are in units of radians.
-
-    ============  ==================  ===============================================================
-    identifier    class name          distance function
-    ------------  ------------------  ---------------------------------------------------------------
-    "haversine"   HaversineDistance   ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))``
-    ============  ==================  ===============================================================
-
-
-    **Metrics intended for integer-valued vector spaces:**  Though intended
-    for integer-valued vectors, these are also valid metrics in the case of
-    real-valued vectors.
-
-    =============  ====================  ========================================
-    identifier     class name            distance function
-    -------------  --------------------  ----------------------------------------
-    "hamming"      HammingDistance       ``N_unequal(x, y) / N_tot``
-    "canberra"     CanberraDistance      ``sum(|x - y| / (|x| + |y|))``
-    "braycurtis"   BrayCurtisDistance    ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
-    =============  ====================  ========================================
-
-    **Metrics intended for boolean-valued vector spaces:**  Any nonzero entry
-    is evaluated to "True".  In the listings below, the following
-    abbreviations are used:
-
-     - N  : number of dimensions
-     - NTT : number of dims in which both values are True
-     - NTF : number of dims in which the first value is True, second is False
-     - NFT : number of dims in which the first value is False, second is True
-     - NFF : number of dims in which both values are False
-     - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
-     - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
-
-    =================  =======================  ===============================
-    identifier         class name               distance function
-    -----------------  -----------------------  -------------------------------
-    "jaccard"          JaccardDistance          NNEQ / NNZ
-    "matching"         MatchingDistance         NNEQ / N
-    "dice"             DiceDistance             NNEQ / (NTT + NNZ)
-    "kulsinski"        KulsinskiDistance        (NNEQ + N - NTT) / (NNEQ + N)
-    "rogerstanimoto"   RogersTanimotoDistance   2 * NNEQ / (N + NNEQ)
-    "russellrao"       RussellRaoDistance       (N - NTT) / N
-    "sokalmichener"    SokalMichenerDistance    2 * NNEQ / (N + NNEQ)
-    "sokalsneath"      SokalSneathDistance      NNEQ / (NNEQ + 0.5 * NTT)
-    =================  =======================  ===============================
-
-    **User-defined distance:**
-
-    ===========    ===============    =======
-    identifier     class name         args
-    -----------    ---------------    -------
-    "pyfunc"       PyFuncDistance     func
-    ===========    ===============    =======
-
-    Here ``func`` is a function which takes two one-dimensional numpy
-    arrays, and returns a distance.  Note that in order to be used within
-    the BallTree, the distance must be a true metric:
-    i.e. it must satisfy the following properties
-
-    1) Non-negativity: d(x, y) >= 0
-    2) Identity: d(x, y) = 0 if and only if x == y
-    3) Symmetry: d(x, y) = d(y, x)
-    4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
-
-    Because of the Python object overhead involved in calling the python
-    function, this will be fairly slow, but it will have the same
-    scaling as other distances.
-    """
-    def __cinit__(self):
-        self.p = 2
-        self.vec = np.zeros(1, dtype=np.float32, order='C')
-        self.mat = np.zeros((1, 1), dtype=np.float32, order='C')
-        self.size = 1
-
-    def __reduce__(self):
-        """
-        reduce method used for pickling
-        """
-        return (newObj, (self.__class__,), self.__getstate__())
-
-    def __getstate__(self):
-        """
-        get state for pickling
-        """
-        if self.__class__.__name__ == "PyFuncDistance32":
-            return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs)
-        return (float(self.p), np.asarray(self.vec), np.asarray(self.mat))
-
-    def __setstate__(self, state):
-        """
-        set state for pickling
-        """
-        self.p = state[0]
-        self.vec = ReadonlyArrayWrapper(state[1])
-        self.mat = ReadonlyArrayWrapper(state[2])
-        if self.__class__.__name__ == "PyFuncDistance32":
-            self.func = state[3]
-            self.kwargs = state[4]
-        self.size = self.vec.shape[0]
-
-    @classmethod
-    def get_metric(cls, metric, **kwargs):
-        """Get the given distance metric from the string identifier.
-
-        See the docstring of DistanceMetric for a list of available metrics.
-
-        Parameters
-        ----------
-        metric : str or class name
-            The distance metric to use
-        **kwargs
-            additional arguments will be passed to the requested metric
-        """
-        if isinstance(metric, DistanceMetric32):
-            return metric
-
-        if callable(metric):
-            return PyFuncDistance32(metric, **kwargs)
-
-        # Map the metric string ID to the metric class
-        if isinstance(metric, type) and issubclass(metric, DistanceMetric32):
-            pass
-        else:
-            try:
-                metric = METRIC_MAPPING32[metric]
-            except:
-                raise ValueError("Unrecognized metric '%s'" % metric)
-
-        # In Minkowski special cases, return more efficient methods
-        if metric is MinkowskiDistance32:
-            p = kwargs.pop('p', 2)
-            w = kwargs.pop('w', None)
-            if p == 1 and w is None:
-                return ManhattanDistance32(**kwargs)
-            elif p == 2 and w is None:
-                return EuclideanDistance32(**kwargs)
-            elif np.isinf(p) and w is None:
-                return ChebyshevDistance32(**kwargs)
-            else:
-                return MinkowskiDistance32(p, w, **kwargs)
-        else:
-            return metric(**kwargs)
-
-    def __init__(self):
-        if self.__class__ is DistanceMetric32:
-            raise NotImplementedError("DistanceMetric32 is an abstract class")
-
-    def _validate_data(self, X):
-        """Validate the input data.
-
-        This should be overridden in a base class if a specific input format
-        is required.
-        """
-        return
-
-    cdef DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                      ITYPE_t size) nogil except -1:
-        """Compute the distance between vectors x1 and x2
-
-        This should be overridden in a base class.
-        """
-        return -999
-
-    cdef DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                       ITYPE_t size) nogil except -1:
-        """Compute the rank-preserving surrogate distance between vectors x1 and x2.
-
-        This can optionally be overridden in a base class.
-
-        The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For example, the
-        rank-preserving surrogate distance of the Euclidean metric is the
-        squared-euclidean distance.
-        """
-        return self.dist(x1, x2, size)
-
-    cdef int pdist(self, const np.float32_t[:, ::1] X, np.float32_t[:, ::1] D) except -1:
-        """compute the pairwise distances between points in X"""
-        cdef ITYPE_t i1, i2
-        for i1 in range(X.shape[0]):
-            for i2 in range(i1, X.shape[0]):
-                D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1])
-                D[i2, i1] = D[i1, i2]
-        return 0
-
-    cdef int cdist(self, const np.float32_t[:, ::1] X, const np.float32_t[:, ::1] Y,
-                   np.float32_t[:, ::1] D) except -1:
-        """compute the cross-pairwise distances between arrays X and Y"""
-        cdef ITYPE_t i1, i2
-        if X.shape[1] != Y.shape[1]:
-            raise ValueError('X and Y must have the same second dimension')
-        for i1 in range(X.shape[0]):
-            for i2 in range(Y.shape[0]):
-                D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
-        return 0
-
-    cdef DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        """Convert the rank-preserving surrogate distance to the distance"""
-        return rdist
-
-    cdef DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        """Convert the distance to the rank-preserving surrogate distance"""
-        return dist
-
-    def rdist_to_dist(self, rdist):
-        """Convert the rank-preserving surrogate distance to the distance.
-
-        The surrogate distance is any measure that yields the same rank as the
-        distance, but is more efficient to compute. For example, the
-        rank-preserving surrogate distance of the Euclidean metric is the
-        squared-euclidean distance.
-
-        Parameters
-        ----------
-        rdist : double
-            Surrogate distance.
-
-        Returns
-        -------
-        double
-            True distance.
-        """
-        return rdist
-
-    def dist_to_rdist(self, dist):
-        """Convert the true distance to the rank-preserving surrogate distance.
-
-        The surrogate distance is any measure that yields the same rank as the
-        distance, but is more efficient to compute. For example, the
-        rank-preserving surrogate distance of the Euclidean metric is the
-        squared-euclidean distance.
-
-        Parameters
-        ----------
-        dist : double
-            True distance.
-
-        Returns
-        -------
-        double
-            Surrogate distance.
-        """
-        return dist
-
-    def pairwise(self, X, Y=None):
-        """Compute the pairwise distances between X and Y
-
-        This is a convenience routine for the sake of testing.  For many
-        metrics, the utilities in scipy.spatial.distance.cdist and
-        scipy.spatial.distance.pdist will be faster.
-
-        Parameters
-        ----------
-        X : array-like
-            Array of shape (Nx, D), representing Nx points in D dimensions.
-        Y : array-like (optional)
-            Array of shape (Ny, D), representing Ny points in D dimensions.
-            If not specified, then Y=X.
-
-        Returns
-        -------
-        dist : ndarray
-            The shape (Nx, Ny) array of pairwise distances between points in
-            X and Y.
-        """
-        cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Xarr
-        cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Yarr
-        cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Darr
-
-        Xarr = np.asarray(X, dtype=np.float32, order='C')
-        self._validate_data(Xarr)
-        if Y is None:
-            Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
-                         dtype=np.float32, order='C')
-            self.pdist(Xarr, Darr)
-        else:
-            Yarr = np.asarray(Y, dtype=np.float32, order='C')
-            self._validate_data(Yarr)
-            Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
-                         dtype=np.float32, order='C')
-            self.cdist(Xarr, Yarr, Darr)
-        return Darr
-
-
-#------------------------------------------------------------
-# Euclidean Distance
-#  d = sqrt(sum(x_i^2 - y_i^2))
-cdef class EuclideanDistance32(DistanceMetric32):
-    r"""Euclidean Distance metric
-
-    .. math::
-       D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }
-    """
-    def __init__(self):
-        self.p = 2
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return euclidean_dist32(x1, x2, size)
-
-    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) nogil except -1:
-        return euclidean_rdist32(x1, x2, size)
-
-    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        return sqrt(rdist)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        return dist * dist
-
-    def rdist_to_dist(self, rdist):
-        return np.sqrt(rdist)
-
-    def dist_to_rdist(self, dist):
-        return dist ** 2
-
-
-#------------------------------------------------------------
-# SEuclidean Distance
-#  d = sqrt(sum((x_i - y_i2)^2 / v_i))
-cdef class SEuclideanDistance32(DistanceMetric32):
-    r"""Standardized Euclidean Distance metric
-
-    .. math::
-       D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
-    """
-    def __init__(self, V):
-        self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=np.float32))
-        self.size = self.vec.shape[0]
-        self.p = 2
-
-    def _validate_data(self, X):
-        if X.shape[1] != self.size:
-            raise ValueError('SEuclidean dist: size of V does not match')
-
-    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t tmp, d=0
-        cdef np.intp_t j
-        for j in range(size):
-            tmp = <DTYPE_t> (x1[j] - x2[j])
-            d += <DTYPE_t> (tmp * tmp / self.vec[j])
-        return d
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return sqrt(self.rdist(x1, x2, size))
-
-    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        return sqrt(rdist)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        return dist * dist
-
-    def rdist_to_dist(self, rdist):
-        return np.sqrt(rdist)
-
-    def dist_to_rdist(self, dist):
-        return dist ** 2
-
-
-#------------------------------------------------------------
-# Manhattan Distance
-#  d = sum(abs(x_i - y_i))
-cdef class ManhattanDistance32(DistanceMetric32):
-    r"""Manhattan/City-block Distance metric
-
-    .. math::
-       D(x, y) = \sum_i |x_i - y_i|
-    """
-    def __init__(self):
-        self.p = 1
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            d += <DTYPE_t> fabs(x1[j] - x2[j])
-        return d
-
-
-#------------------------------------------------------------
-# Chebyshev Distance
-#  d = max_i(abs(x_i - y_i))
-cdef class ChebyshevDistance32(DistanceMetric32):
-    """Chebyshev/Infinity Distance
-
-    .. math::
-       D(x, y) = max_i (|x_i - y_i|)
-
-    Examples
-    --------
-    >>> from sklearn.metrics.dist_metrics import DistanceMetric
-    >>> dist = DistanceMetric.get_metric('chebyshev')
-    >>> X = [[0, 1, 2],
-    ...      [3, 4, 5]]
-    >>> Y = [[-1, 0, 1],
-    ...      [3, 4, 5]]
-    >>> dist.pairwise(X, Y)
-    array([[1.732..., 5.196...],
-           [6.928..., 0....   ]])
-    """
-    def __init__(self):
-        self.p = INF32
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            d = <DTYPE_t> fmax(d, fabs(x1[j] - x2[j]))
-        return d
-
-
-#------------------------------------------------------------
-# Minkowski Distance
-cdef class MinkowskiDistance32(DistanceMetric32):
-    r"""Minkowski Distance
-
-    .. math::
-        D(x, y) = {||u-v||}_p
-
-    when w is None.
-
-    Here is the more general expanded expression for the weighted case:
-
-    .. math::
-        D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p)
-
-    Parameters
-    ----------
-    p : int
-        The order of the p-norm of the difference (see above).
-    w : (N,) array-like (optional)
-        The weight vector.
-
-    Minkowski Distance requires p >= 1 and finite. For p = infinity,
-    use ChebyshevDistance.
-    Note that for p=1, ManhattanDistance is more efficient, and for
-    p=2, EuclideanDistance is more efficient.
-    """
-    def __init__(self, p, w=None):
-        if p < 1:
-            raise ValueError("p must be greater than 1")
-        elif np.isinf(p):
-            raise ValueError("MinkowskiDistance requires finite p. "
-                             "For p=inf, use ChebyshevDistance.")
-
-        self.p = p
-        if w is not None:
-            w_array = check_array(
-                w, ensure_2d=False, dtype=np.float32, input_name="w"
-            )
-            if (w_array < 0).any():
-                raise ValueError("w cannot contain negative weights")
-            self.vec = ReadonlyArrayWrapper(w_array)
-            self.size = self.vec.shape[0]
-        else:
-            self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=np.float32))
-            self.size = 0
-
-    def _validate_data(self, X):
-        if self.size > 0 and X.shape[1] != self.size:
-            raise ValueError("MinkowskiDistance: the size of w must match "
-                             f"the number of features ({X.shape[1]}). "
-                             f"Currently len(w)={self.size}.")
-
-    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d=0
-        cdef np.intp_t j
-        cdef bint has_w = self.size > 0
-        if has_w:
-            for j in range(size):
-                d += <DTYPE_t> (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p))
-        else:
-            for j in range(size):
-                d += <DTYPE_t> (pow(fabs(x1[j] - x2[j]), self.p))
-        return d
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
-
-    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        return <DTYPE_t> pow(rdist, 1. / self.p)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        return <DTYPE_t> pow(dist, self.p)
-
-    def rdist_to_dist(self, rdist):
-        return rdist ** (1. / self.p)
-
-    def dist_to_rdist(self, dist):
-        return dist ** self.p
-
-
-#------------------------------------------------------------
-# TODO: Remove in 1.3 - WMinkowskiDistance class
-# W-Minkowski Distance
-cdef class WMinkowskiDistance32(DistanceMetric32):
-    r"""Weighted Minkowski Distance
-
-    .. math::
-       D(x, y) = [\sum_i |w_i * (x_i - y_i)|^p] ^ (1/p)
-
-    Weighted Minkowski Distance requires p >= 1 and finite.
-
-    Parameters
-    ----------
-    p : int
-        The order of the norm of the difference :math:`{||u-v||}_p`.
-    w : (N,) array-like
-        The weight vector.
-
-    """
-    def __init__(self, p, w):
-        from warnings import warn
-        warn("WMinkowskiDistance is deprecated in version 1.1 and will be "
-            "removed in version 1.3. Use MinkowskiDistance instead. Note "
-            "that in MinkowskiDistance, the weights are applied to the "
-            "absolute differences raised to the p power. This is different "
-            "from WMinkowskiDistance where weights are applied to the "
-            "absolute differences before raising to the p power. "
-            "The deprecation aims to remain consistent with SciPy 1.8 "
-            "convention.", FutureWarning)
-
-        if p < 1:
-            raise ValueError("p must be greater than 1")
-        elif np.isinf(p):
-            raise ValueError("WMinkowskiDistance requires finite p. "
-                             "For p=inf, use ChebyshevDistance.")
-        self.p = p
-        self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=np.float32))
-        self.size = self.vec.shape[0]
-
-    def _validate_data(self, X):
-        if X.shape[1] != self.size:
-            raise ValueError('WMinkowskiDistance dist: '
-                             'size of w does not match')
-
-    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
-        return d
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return <DTYPE_t> pow(self.rdist(x1, x2, size), 1. / self.p)
-
-    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        return <DTYPE_t> pow(rdist, 1. / self.p)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        return <DTYPE_t> pow(dist, self.p)
-
-    def rdist_to_dist(self, rdist):
-        return rdist ** (1. / self.p)
-
-    def dist_to_rdist(self, dist):
-        return dist ** self.p
-
-
-#------------------------------------------------------------
-# Mahalanobis Distance
-#  d = sqrt( (x - y)^T V^-1 (x - y) )
-cdef class MahalanobisDistance32(DistanceMetric32):
-    """Mahalanobis Distance
-
-    .. math::
-       D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) }
-
-    Parameters
-    ----------
-    V : array-like
-        Symmetric positive-definite covariance matrix.
-        The inverse of this matrix will be explicitly computed.
-    VI : array-like
-        optionally specify the inverse directly.  If VI is passed,
-        then V is not referenced.
-    """
-    def __init__(self, V=None, VI=None):
-        if VI is None:
-            if V is None:
-                raise ValueError("Must provide either V or VI "
-                                 "for Mahalanobis distance")
-            VI = np.linalg.inv(V)
-        if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
-            raise ValueError("V/VI must be square")
-
-        self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=np.float32, order='C'))
-
-        self.size = self.mat.shape[0]
-
-        # we need vec as a work buffer
-        self.vec = np.zeros(self.size, dtype=np.float32)
-
-    def _validate_data(self, X):
-        if X.shape[1] != self.size:
-            raise ValueError('Mahalanobis dist: size of V does not match')
-
-    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t tmp, d = 0
-        cdef np.intp_t i, j
-
-        # compute (x1 - x2).T * VI * (x1 - x2)
-        for i in range(size):
-            self.vec[i] = x1[i] - x2[i]
-
-        for i in range(size):
-            tmp = 0
-            for j in range(size):
-                tmp += self.mat[i, j] * self.vec[j]
-            d += tmp * self.vec[i]
-        return d
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return sqrt(self.rdist(x1, x2, size))
-
-    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        return sqrt(rdist)
-
-    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        return dist * dist
-
-    def rdist_to_dist(self, rdist):
-        return np.sqrt(rdist)
-
-    def dist_to_rdist(self, dist):
-        return dist ** 2
-
-
-#------------------------------------------------------------
-# Hamming Distance
-#  d = N_unequal(x, y) / N_tot
-cdef class HammingDistance32(DistanceMetric32):
-    r"""Hamming Distance
-
-    Hamming distance is meant for discrete-valued vectors, though it is
-    a valid metric for real-valued vectors.
-
-    .. math::
-       D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int n_unequal = 0
-        cdef np.intp_t j
-        for j in range(size):
-            if x1[j] != x2[j]:
-                n_unequal += 1
-        return float(n_unequal) / size
-
-
-#------------------------------------------------------------
-# Canberra Distance
-#  D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ]
-cdef class CanberraDistance32(DistanceMetric32):
-    r"""Canberra Distance
-
-    Canberra distance is meant for discrete-valued vectors, though it is
-    a valid metric for real-valued vectors.
-
-    .. math::
-       D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t denom, d = 0
-        cdef np.intp_t j
-        for j in range(size):
-            denom = <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
-            if denom > 0:
-                d += <DTYPE_t>(fabs(x1[j] - x2[j])) / denom
-        return d
-
-
-#------------------------------------------------------------
-# Bray-Curtis Distance
-#  D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)]
-cdef class BrayCurtisDistance32(DistanceMetric32):
-    r"""Bray-Curtis Distance
-
-    Bray-Curtis distance is meant for discrete-valued vectors, though it is
-    a valid metric for real-valued vectors.
-
-    .. math::
-       D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef DTYPE_t num = 0, denom = 0
-        cdef np.intp_t j
-        for j in range(size):
-            num += <DTYPE_t> fabs(x1[j] - x2[j])
-            denom += <DTYPE_t> (fabs(x1[j]) + fabs(x2[j]))
-        if denom > 0:
-            return num / denom
-        else:
-            return 0.0
-
-
-#------------------------------------------------------------
-# Jaccard Distance (boolean)
-#  D(x, y) = N_unequal(x, y) / N_nonzero(x, y)
-cdef class JaccardDistance32(DistanceMetric32):
-    r"""Jaccard Distance
-
-    Jaccard Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_eq = 0, nnz = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            nnz += (tf1 or tf2)
-            n_eq += (tf1 and tf2)
-        # Based on https://github.com/scipy/scipy/pull/7373
-        # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
-        # was changed to return 0, instead of nan.
-        if nnz == 0:
-            return 0
-        return (nnz - n_eq) * 1.0 / nnz
-
-
-#------------------------------------------------------------
-# Matching Distance (boolean)
-#  D(x, y) = n_neq / n
-cdef class MatchingDistance32(DistanceMetric32):
-    r"""Matching Distance
-
-    Matching Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-        return n_neq * 1. / size
-
-
-#------------------------------------------------------------
-# Dice Distance (boolean)
-#  D(x, y) = n_neq / (2 * ntt + n_neq)
-cdef class DiceDistance32(DistanceMetric32):
-    r"""Dice Distance
-
-    Dice Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0, ntt = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            ntt += (tf1 and tf2)
-            n_neq += (tf1 != tf2)
-        return n_neq / (2.0 * ntt + n_neq)
-
-
-#------------------------------------------------------------
-# Kulsinski Distance (boolean)
-#  D(x, y) = (ntf + nft - ntt + n) / (n_neq + n)
-cdef class KulsinskiDistance32(DistanceMetric32):
-    r"""Kulsinski Distance
-
-    Kulsinski Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, ntt = 0, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-            ntt += (tf1 and tf2)
-        return (n_neq - ntt + size) * 1.0 / (n_neq + size)
-
-
-#------------------------------------------------------------
-# Rogers-Tanimoto Distance (boolean)
-#  D(x, y) = 2 * n_neq / (n + n_neq)
-cdef class RogersTanimotoDistance32(DistanceMetric32):
-    r"""Rogers-Tanimoto Distance
-
-    Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-        return (2.0 * n_neq) / (size + n_neq)
-
-
-#------------------------------------------------------------
-# Russell-Rao Distance (boolean)
-#  D(x, y) = (n - ntt) / n
-cdef class RussellRaoDistance32(DistanceMetric32):
-    r"""Russell-Rao Distance
-
-    Russell-Rao Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, ntt = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            ntt += (tf1 and tf2)
-        return (size - ntt) * 1. / size
-
-
-#------------------------------------------------------------
-# Sokal-Michener Distance (boolean)
-#  D(x, y) = 2 * n_neq / (n + n_neq)
-cdef class SokalMichenerDistance32(DistanceMetric32):
-    r"""Sokal-Michener Distance
-
-    Sokal-Michener Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-        return (2.0 * n_neq) / (size + n_neq)
-
-
-#------------------------------------------------------------
-# Sokal-Sneath Distance (boolean)
-#  D(x, y) = n_neq / (0.5 * n_tt + n_neq)
-cdef class SokalSneathDistance32(DistanceMetric32):
-    r"""Sokal-Sneath Distance
-
-    Sokal-Sneath Distance is a dissimilarity measure for boolean-valued
-    vectors. All nonzero entries will be treated as True, zero entries will
-    be treated as False.
-
-    """
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        cdef int tf1, tf2, ntt = 0, n_neq = 0
-        cdef np.intp_t j
-        for j in range(size):
-            tf1 = x1[j] != 0
-            tf2 = x2[j] != 0
-            n_neq += (tf1 != tf2)
-            ntt += (tf1 and tf2)
-        return n_neq / (0.5 * ntt + n_neq)
-
-
-#------------------------------------------------------------
-# Haversine Distance (2 dimensional)
-#  D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2)
-#                          + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]}
-cdef class HaversineDistance32(DistanceMetric32):
-    """Haversine (Spherical) Distance
-
-    The Haversine distance is the angular distance between two points on
-    the surface of a sphere.  The first distance of each point is assumed
-    to be the latitude, the second is the longitude, given in radians.
-    The dimension of the points must be 2:
-
-    """
-
-    def _validate_data(self, X):
-        if X.shape[1] != 2:
-            raise ValueError("Haversine distance only valid "
-                             "in 2 dimensions")
-
-    cdef inline DTYPE_t rdist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) nogil except -1:
-        cdef DTYPE_t sin_0 = <DTYPE_t> sin(0.5 * (x1[0] - x2[0]))
-        cdef DTYPE_t sin_1 = <DTYPE_t> sin(0.5 * (x1[1] - x2[1]))
-        return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
-
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return 2 * asin(sqrt(self.rdist(x1, x2, size)))
-
-    cdef inline DTYPE_t _rdist_to_dist(self, np.float32_t rdist) nogil except -1:
-        return 2 * asin(sqrt(rdist))
-
-    cdef inline DTYPE_t _dist_to_rdist(self, np.float32_t dist) nogil except -1:
-        cdef DTYPE_t tmp = <DTYPE_t> sin(0.5 * dist)
-        return tmp * tmp
-
-    def rdist_to_dist(self, rdist):
-        return 2 * np.arcsin(np.sqrt(rdist))
-
-    def dist_to_rdist(self, dist):
-        tmp = np.sin(0.5 * dist)
-        return tmp * tmp
-
-#------------------------------------------------------------
-# User-defined distance
-#
-cdef class PyFuncDistance32(DistanceMetric32):
-    """PyFunc Distance
-
-    A user-defined distance
-
-    Parameters
-    ----------
-    func : function
-        func should take two numpy arrays as input, and return a distance.
-    """
-    def __init__(self, func, **kwargs):
-        self.func = func
-        self.kwargs = kwargs
-
-    # in cython < 0.26, GIL was required to be acquired during definition of
-    # the function and inside the body of the function. This behaviour is not
-    # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
-    # only way to be back compatible is to inherit `dist` from the base class
-    # without GIL and called an inline `_dist` which acquire GIL.
-    cdef inline DTYPE_t dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                             ITYPE_t size) nogil except -1:
-        return self._dist(x1, x2, size)
-
-    cdef inline DTYPE_t _dist(self, const np.float32_t* x1, const np.float32_t* x2,
-                              ITYPE_t size) except -1 with gil:
-        cdef np.ndarray x1arr
-        cdef np.ndarray x2arr
-        x1arr = _buffer_to_ndarray32(x1, size)
-        x2arr = _buffer_to_ndarray32(x2, size)
-        d = self.func(x1arr, x2arr, **self.kwargs)
-        try:
-            # Cython generates code here that results in a TypeError
-            # if d is the wrong type.
-            return d
-        except TypeError:
-            raise TypeError("Custom distance function must accept two "
-                            "vectors and return a float.")
-
-
-######################################################################
-# Datasets Pair Classes
-cdef class DatasetsPair32:
-    """Abstract class which wraps a pair of datasets (X, Y).
-
-    This class allows computing distances between a single pair of rows of
-    of X and Y at a time given the pair of their indices (i, j). This class is
-    specialized for each metric thanks to the :func:`get_for` factory classmethod.
-
-    The handling of parallelization over chunks to compute the distances
-    and aggregation for several rows at a time is done in dedicated
-    subclasses of PairwiseDistancesReduction that in-turn rely on
-    subclasses of DatasetsPair for each pair of rows in the data. The goal
-    is to make it possible to decouple the generic parallelization and
-    aggregation logic from metric-specific computation as much as
-    possible.
-
-    X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
-    in subclasses.
-
-    This class avoids the overhead of dispatching distance computations
-    to :class:`sklearn.metrics.DistanceMetric` based on the physical
-    representation of the vectors (sparse vs. dense). It makes use of
-    cython.final to remove the overhead of dispatching method calls.
-
-    Parameters
-    ----------
-    distance_metric: DistanceMetric
-        The distance metric responsible for computing distances
-        between two vectors of (X, Y).
-    """
-
-    @classmethod
-    def get_for(
-        cls,
-        X,
-        Y,
-        str metric="euclidean",
-        dict metric_kwargs=None,
-    ) -> DatasetsPair32:
-        """Return the DatasetsPair implementation for the given arguments.
-
-        Parameters
-        ----------
-        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-            If provided as a sparse matrix, it must be in CSR format.
-
-        metric : str, default='euclidean'
-            The distance metric to compute between rows of X and Y.
-            The default metric is a fast implementation of the Euclidean
-            metric. For a list of available metrics, see the documentation
-            of :class:`~sklearn.metrics.DistanceMetric`.
-
-        metric_kwargs : dict, default=None
-            Keyword arguments to pass to specified metric function.
-
-        Returns
-        -------
-        datasets_pair: DatasetsPair
-            The suited DatasetsPair implementation.
-        """
-        cdef:
-            DistanceMetric32 distance_metric = DistanceMetric32.get_metric(
-                metric,
-                **(metric_kwargs or {})
-            )
-
-        if not(X.dtype == Y.dtype == np.float32):
-            raise ValueError(
-                f"Datasets must be of np.float32 type. "
-                f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
-            )
-
-        # Metric-specific checks that do not replace nor duplicate `check_array`.
-        distance_metric._validate_data(X)
-        distance_metric._validate_data(Y)
-
-        # TODO: dispatch to other dataset pairs for sparse support once available:
-        if issparse(X) or issparse(Y):
-            raise ValueError("Only dense datasets are supported for X and Y.")
-
-        return DenseDenseDatasetsPair32(X, Y, distance_metric)
-
-    def __init__(self, DistanceMetric32 distance_metric):
-        self.distance_metric = distance_metric
-
-    cdef ITYPE_t n_samples_X(self) nogil:
-        """Number of samples in X."""
-        # This is a abstract method.
-        # This _must_ always be overwritten in subclasses.
-        # TODO: add "with gil: raise" here when supporting Cython 3.0
-        return -999
-
-    cdef ITYPE_t n_samples_Y(self) nogil:
-        """Number of samples in Y."""
-        # This is a abstract method.
-        # This _must_ always be overwritten in subclasses.
-        # TODO: add "with gil: raise" here when supporting Cython 3.0
-        return -999
-
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        return self.dist(i, j)
-
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        # This is a abstract method.
-        # This _must_ always be overwritten in subclasses.
-        # TODO: add "with gil: raise" here when supporting Cython 3.0
-        return -1
-
-@final
-cdef class DenseDenseDatasetsPair32(DatasetsPair32):
-    """Compute distances between row vectors of two arrays.
-
-    Parameters
-    ----------
-    X: ndarray of shape (n_samples_X, n_features)
-        Rows represent vectors. Must be C-contiguous.
-
-    Y: ndarray of shape (n_samples_Y, n_features)
-        Rows represent vectors. Must be C-contiguous.
-
-    distance_metric: DistanceMetric
-        The distance metric responsible for computing distances
-        between two row vectors of (X, Y).
-    """
-
-    def __init__(self, X, Y, DistanceMetric32 distance_metric):
-        super().__init__(distance_metric)
-        # Arrays have already been checked
-        self.X = X
-        self.Y = Y
-        self.d = X.shape[1]
-
-    @final
-    cdef ITYPE_t n_samples_X(self) nogil:
-        return self.X.shape[0]
-
-    @final
-    cdef ITYPE_t n_samples_Y(self) nogil:
-        return self.Y.shape[0]
-
-    @final
-    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        return self.distance_metric.rdist(&self.X[i, 0],
-                                          &self.Y[j, 0],
-                                          self.d)
-
-    @final
-    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
-        return self.distance_metric.dist(&self.X[i, 0],
-                                         &self.Y[j, 0],
-                                         self.d)
diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index 672e3fce086c4..d7210adf1b97a 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -43,7 +43,7 @@ cdef extern from "arrayobject.h":
                                      int typenum, void* data)
 
 from scipy.sparse import csr_matrix, issparse
-from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DTYPECODE
 from ..utils._typedefs import DTYPE, ITYPE
 from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
 from ..utils import check_array
@@ -1155,10 +1155,11 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             raise TypeError("Custom distance function must accept two "
                             "vectors and return a float.")
 
+{{endfor}}
 
 ######################################################################
 # Datasets Pair Classes
-cdef class DatasetsPair{{name_suffix}}:
+cdef class DatasetsPair:
     """Abstract class which wraps a pair of datasets (X, Y).
 
     This class allows computing distances between a single pair of rows of
@@ -1195,7 +1196,7 @@ cdef class DatasetsPair{{name_suffix}}:
         Y,
         str metric="euclidean",
         dict metric_kwargs=None,
-    ) -> DatasetsPair{{name_suffix}}:
+    ) -> DatasetsPair:
         """Return the DatasetsPair implementation for the given arguments.
 
         Parameters
@@ -1225,14 +1226,14 @@ cdef class DatasetsPair{{name_suffix}}:
             The suited DatasetsPair implementation.
         """
         cdef:
-            DistanceMetric{{name_suffix}} distance_metric = DistanceMetric{{name_suffix}}.get_metric(
+            DistanceMetric distance_metric = DistanceMetric.get_metric(
                 metric,
                 **(metric_kwargs or {})
             )
 
-        if not(X.dtype == Y.dtype == np.float{{bitness}}):
+        if not(X.dtype == Y.dtype == np.float64):
             raise ValueError(
-                f"Datasets must be of np.float{{bitness}} type. "
+                f"Datasets must be of np.float64 type. "
                 f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
             )
 
@@ -1244,9 +1245,9 @@ cdef class DatasetsPair{{name_suffix}}:
         if issparse(X) or issparse(Y):
             raise ValueError("Only dense datasets are supported for X and Y.")
 
-        return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
+        return DenseDenseDatasetsPair(X, Y, distance_metric)
 
-    def __init__(self, DistanceMetric{{name_suffix}} distance_metric):
+    def __init__(self, DistanceMetric distance_metric):
         self.distance_metric = distance_metric
 
     cdef ITYPE_t n_samples_X(self) nogil:
@@ -1273,7 +1274,7 @@ cdef class DatasetsPair{{name_suffix}}:
         return -1
 
 @final
-cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+cdef class DenseDenseDatasetsPair(DatasetsPair):
     """Compute distances between row vectors of two arrays.
 
     Parameters
@@ -1289,7 +1290,7 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         between two row vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, DistanceMetric{{name_suffix}} distance_metric):
+    def __init__(self, X, Y, DistanceMetric distance_metric):
         super().__init__(distance_metric)
         # Arrays have already been checked
         self.X = X
@@ -1315,5 +1316,3 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
         return self.distance_metric.dist(&self.X[i, 0],
                                          &self.Y[j, 0],
                                          self.d)
-
-{{endfor}}

From e8b8344faed61809a4e2b9977faf714ea13677ad Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Mar 2022 17:53:34 +0200
Subject: [PATCH 19/26] Fix typo

---
 sklearn/metrics/_dist_metrics.pxd.tp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index 8c9c960bd882e..2e3d847c77c93 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -17,7 +17,7 @@ implementation_specific_values = [
     # implementations.
     #
     # We also use 64bit types as defined in `sklearn.utils._typedefs`
-    # to maintain backward compatibility as the symbol level for extra
+    # to maintain backward compatibility at the symbol level for extra
     # safety.
     #
     ('', '64', 'DTYPE_t', 'DTYPE'),

From 2d104a4bc8ea1be9a822cbbebcd065fe4a1c6108 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 1 Apr 2022 17:10:57 +0200
Subject: [PATCH 20/26] TST Adapt error messages

---
 .../tests/test_pairwise_distances_reduction.py   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 192f7ef43a6c6..c9d7faa0bb0e3 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -130,8 +130,8 @@ def test_argkmin_factory_method_wrong_usages():
     metric = "euclidean"
 
     msg = (
-        "Only 64bit float datasets are supported at this time, "
-        "got: X.dtype=float32 and Y.dtype=float64"
+        "Datasets must be of np.float64 type. Currently: "
+        "X.dtype=float32 and Y.dtype=float64."
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(
@@ -139,8 +139,8 @@ def test_argkmin_factory_method_wrong_usages():
         )
 
     msg = (
-        "Only 64bit float datasets are supported at this time, "
-        "got: X.dtype=float64 and Y.dtype=int32"
+        "Datasets must be of np.float64 type. Currently: "
+        "X.dtype=float64 and Y.dtype=int32"
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)
@@ -190,8 +190,8 @@ def test_radius_neighborhood_factory_method_wrong_usages():
     with pytest.raises(
         ValueError,
         match=(
-            "Only 64bit float datasets are supported at this time, "
-            "got: X.dtype=float32 and Y.dtype=float64"
+            "Datasets must be of np.float64 type. "
+            "Currently: X.dtype=float32 and Y.dtype=float64"
         ),
     ):
         PairwiseDistancesRadiusNeighborhood.compute(
@@ -201,8 +201,8 @@ def test_radius_neighborhood_factory_method_wrong_usages():
     with pytest.raises(
         ValueError,
         match=(
-            "Only 64bit float datasets are supported at this time, "
-            "got: X.dtype=float64 and Y.dtype=int32"
+            "Datasets must be of np.float64 type. "
+            "Currently: X.dtype=float64 and Y.dtype=int32"
         ),
     ):
         PairwiseDistancesRadiusNeighborhood.compute(

From 5d261b16efeae3eac902e10fdb06724dffee0c27 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Mon, 4 Apr 2022 14:36:33 +0200
Subject: [PATCH 21/26] MAINT Reorganise upcast w.r.t GEMMTermComputer
 introduction

---
 doc/whats_new/v1.1.rst                        |    1 -
 .../_pairwise_distances_reduction.pyx.tp      | 1383 +++++++++++++----
 .../test_pairwise_distances_reduction.py      |  232 ++-
 3 files changed, 1320 insertions(+), 296 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 35cab2a6dc5ab..224b0013d7ad9 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -120,7 +120,6 @@ Changelog
   can respectively be up to ×20 and ×5 faster than previously.
 
   :pr:`21987`, :pr:`22064`, :pr:`22065`, :pr:`22288` and :pr:`22590`.
-  :pr:`21987`, :pr:`22064`, :pr:`22065`, :pr:`22288` and :pr:`22320`
   by :user:`Julien Jerphanion <jjerphan>`
 
 - |Enhancement| All scikit-learn models now generate a more informative
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index 196e935f908c5..e93d7a2208be1 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -47,6 +47,8 @@ import warnings
 from .. import get_config
 from libc.stdlib cimport free, malloc
 from libc.float cimport DBL_MAX
+from libcpp.memory cimport shared_ptr, make_shared
+from libcpp.vector cimport vector
 from cython cimport final
 from cython.operator cimport dereference as deref
 from cython.parallel cimport parallel, prange
@@ -61,11 +63,13 @@ from ..utils._cython_blas cimport (
   _dot,
   _gemm,
 )
-from ..utils._heap cimport simultaneous_sort, heap_push
+from ..utils._heap cimport heap_push
+from ..utils._sorting cimport simultaneous_sort
 from ..utils._openmp_helpers cimport _openmp_thread_num
 from ..utils._typedefs cimport ITYPE_t, DTYPE_t
+from ..utils._vector_sentinel cimport vector_to_nd_array
 
-from numbers import Integral
+from numbers import Integral, Real
 from typing import List
 from scipy.sparse import issparse
 from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING
@@ -76,6 +80,68 @@ from ..utils._typedefs import ITYPE, DTYPE
 
 np.import_array()
 
+# TODO: change for `libcpp.algorithm.move` once Cython 3 is used
+# Introduction in Cython:
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa
+cdef extern from "<algorithm>" namespace "std" nogil:
+    OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa
+
+######################
+## std::vector to np.ndarray coercion
+# As type covariance is not supported for C++ containers via Cython,
+# we need to redefine fused types.
+ctypedef fused vector_DITYPE_t:
+    vector[ITYPE_t]
+    vector[DTYPE_t]
+
+
+ctypedef fused vector_vector_DITYPE_t:
+    vector[vector[ITYPE_t]]
+    vector[vector[DTYPE_t]]
+
+
+cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
+    shared_ptr[vector_vector_DITYPE_t] vecs
+):
+    """Coerce a std::vector of std::vector to a ndarray of ndarray."""
+    cdef:
+        ITYPE_t n = deref(vecs).size()
+        np.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n,
+                                                                     dtype=np.ndarray)
+
+    for i in range(n):
+        nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))
+
+    return nd_arrays_of_nd_arrays
+
+#####################
+
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
+    const DTYPE_t[:, ::1] X,
+    ITYPE_t num_threads,
+):
+    """Compute the squared euclidean norm of the rows of X in parallel.
+
+    This is faster than using np.einsum("ij, ij->i") even when using a single thread.
+    """
+    cdef:
+        # Casting for X to remove the const qualifier is needed because APIs
+        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
+        # const qualifier.
+        # See: https://github.com/scipy/scipy/issues/14262
+        DTYPE_t * X_ptr = <DTYPE_t *> &X[0, 0]
+        ITYPE_t idx = 0
+        ITYPE_t n = X.shape[0]
+        ITYPE_t d = X.shape[1]
+        DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
+
+    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
+        squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1)
+
+    return squared_row_norms
+
+#####################
+
 cdef class PairwiseDistancesReduction:
     """Abstract base class for pairwise distance computation & reduction.
 
@@ -304,8 +370,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             This allows decoupling the interface entirely from the
             implementation details whilst maintaining RAII.
         """
-        if X.dtype == Y.dtype == np.float64:
-            return PairwiseDistancesArgKmin64.compute(
+{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
+        if X.dtype == Y.dtype == np.float{{bitness}}:
+            return PairwiseDistancesArgKmin{{bitness}}.compute(
                 X=X,
                 Y=Y,
                 k=k,
@@ -315,23 +382,155 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
                 strategy=strategy,
                 return_distance=return_distance,
             )
-        if X.dtype == Y.dtype == np.float32:
-            return PairwiseDistancesArgKmin32.compute(
+{{endfor}}
+        raise ValueError(
+            f"Datasets must both be of np.float64 or np.float32 dtype. "
+            f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
+    """Compute radius-based neighbors for two sets of vectors.
+
+    For each row-vector X[i] of the queries X, find all the indices j of
+    row-vectors in Y such that:
+
+                        dist(X[i], Y[j]) <= radius
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    Parameters
+    ----------
+    datasets_pair: DatasetsPair
+        The dataset pair (X, Y) for the reduction.
+
+    chunk_size: int, default=None,
+        The number of vectors per chunk. If None (default) looks-up in
+        scikit-learn configuration for `pairwise_dist_chunk_size`,
+        and use 256 if it is not set.
+
+    radius: float
+        The radius defining the neighborhood.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        DTYPE_t radius,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+        bint return_distance=False,
+        bint sort_results=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        radius : float
+            The radius defining the neighborhood.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset the parallelization is made on.
+
+            For both strategies the computations happen with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+              - None (default) looks-up in scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its neighbors if set to True.
+
+        sort_results : boolean, default=False
+            Sort results with respect to distances between each X vector and its
+            neighbors if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+
+        If return_distance=True:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+          - neighbors_distances : ndarray of n_samples_X ndarray
+            Distances to the neighbors for each vector in X.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the arguments
+        values to dispatch to the private
+        :meth:`PairwiseDistancesRadiusNeighborhood._compute` instance method of
+        the most appropriate :class:`PairwiseDistancesRadiusNeighborhood`
+        concrete implementation.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the interface entirely from the
+        implementation details whilst maintaining RAII.
+        """
+{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
+        if X.dtype == Y.dtype == np.float{{bitness}}:
+            return PairwiseDistancesRadiusNeighborhood{{bitness}}.compute(
                 X=X,
                 Y=Y,
-                k=k,
+                radius=radius,
                 metric=metric,
                 chunk_size=chunk_size,
                 metric_kwargs=metric_kwargs,
                 strategy=strategy,
                 return_distance=return_distance,
             )
+{{endfor}}
         raise ValueError(
-            "No implementation exist for fused-typed datasets pair. "
-            f"Currently X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+            f"Datasets must both be of np.float64 or np.float32 dtype. "
+            f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
         )
 
-
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
     const DTYPE_t[:, ::1] X,
     ITYPE_t num_threads,
@@ -394,92 +593,296 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
 
     return squared_row_norms
 
+# dtype-specific implementations.
 
 {{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
 
 from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
 
 
-cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
-    """{{bitness}}bit implementation of PairwiseDistancesReduction."""
+cdef class GEMMTermComputer{{bitness}}:
+    """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM.
+
+    `FastEuclidean*` classes internally compute the squared Euclidean distances between
+    chunks of vectors X_c and Y_c using the following decomposition:
+
+
+                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
 
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
+    arithmetic intensity.
+    """
     cdef:
-        readonly DatasetsPair{{distance_suffix}} datasets_pair
+        const {{DTYPE_t}}[:, ::1] X
+        const {{DTYPE_t}}[:, ::1] Y
 
-        # The number of threads that can be used is stored in effective_n_threads.
-        #
-        # The number of threads to use in the parallelisation strategy
-        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
-        # for small datasets, less threads might be needed to loop over pair of chunks.
-        #
-        # Hence the number of threads that _will_ be used for looping over chunks
-        # is stored in chunks_n_threads, allowing solely using what we need.
-        #
-        # Thus, an invariant is:
-        #
-        #                 chunks_n_threads <= effective_n_threads
-        #
         ITYPE_t effective_n_threads
         ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
 
-        ITYPE_t n_samples_chunk, chunk_size
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
 
-        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
-        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
+{{if need_upcast}}
+        DTYPE_t** X_c_upcast
+        DTYPE_t** Y_c_upcast
+{{endif}}
 
-        bint execute_in_parallel_on_Y
+    def __init__(self,
+        {{DTYPE_t}}[:, ::1] X,
+        {{DTYPE_t}}[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+    ):
+        self.X, self.Y = X, Y
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = n_features
 
-    def __init__(
-        self,
-        DatasetsPair{{distance_suffix}} datasets_pair,
-        chunk_size=None,
-        strategy=None,
-     ):
-        cdef:
-            ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)  # one GEMM buffer per thread
 
-        if chunk_size is None:
-            chunk_size = get_config().get("pairwise_dist_chunk_size", 256)
+{{if need_upcast}}
+        # Per-thread buffers for upcasting chunks of X and Y from 32bit to 64bit.
+        # NOTE(review): a chunk holds n_samples_chunk * n_features values; confirm this size.
+        self.X_c_upcast = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_n_threads)
+        self.Y_c_upcast = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_n_threads)
+        for thread_num in range(self.effective_n_threads):
+            self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(self.dist_middle_terms_chunks_size * sizeof(DTYPE_t))
+            self.Y_c_upcast[thread_num] = <DTYPE_t *> malloc(self.dist_middle_terms_chunks_size * sizeof(DTYPE_t))
 
-        self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20)
+    def __dealloc__(self):  # Cython destructor hook: the name needs both trailing underscores
+        if self.X_c_upcast is not NULL and self.Y_c_upcast is not NULL:  # NULL if __init__ never ran
+            for thread_num in range(self.effective_n_threads):
+                free(self.X_c_upcast[thread_num])
+                free(self.Y_c_upcast[thread_num])
+        free(self.X_c_upcast)  # free(NULL) is a no-op, so the outer arrays need no guard
+        free(self.Y_c_upcast)
+{{endif}}
 
-        self.effective_n_threads = _openmp_effective_n_threads()
 
-        self.datasets_pair = datasets_pair
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,  # not read by this method
+        ITYPE_t X_end,  # not read by this method
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
 
-        self.n_samples_X = datasets_pair.n_samples_X()
-        self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size)
-        X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk
-        X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk
-        self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0)
+        # Upcast Y_c = Y[Y_start:Y_end, :] into this thread's flat buffer (row i starts at i * n_features).
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{else}}
+        return  # no upcast buffers exist in this specialization
+{{endif}}
 
-        if X_n_samples_remainder != 0:
-            self.X_n_samples_last_chunk = X_n_samples_remainder
-        else:
-            self.X_n_samples_last_chunk = self.X_n_samples_chunk
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)  # size this thread's GEMM output buffer
 
-        self.n_samples_Y = datasets_pair.n_samples_Y()
-        self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size)
-        Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk
-        Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk
-        self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0)
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
 
-        if Y_n_samples_remainder != 0:
-            self.Y_n_samples_last_chunk = Y_n_samples_remainder
-        else:
-            self.Y_n_samples_last_chunk = self.Y_n_samples_chunk
+        # Upcast X_c = X[X_start:X_end, :] into this thread's flat buffer (row i starts at i * n_features).
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+{{else}}
+        return  # no upcast buffers exist in this specialization
+{{endif}}
 
-        if strategy is None:
-            strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto')
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):  # loops over chunks_n_threads buffers, not effective_n_threads
+            self.dist_middle_terms_chunks[thread_num].resize(  # size each GEMM output buffer
+                self.dist_middle_terms_chunks_size
+            )
 
-        if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'):
-            raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', "
-                               f"or 'auto', but currently strategy='{self.strategy}'.")
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
 
-        if strategy == 'auto':
-            # This is a simple heuristic whose constant for the
-            # comparison has been chosen based on experiments.
-            if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X:
+        # Upcast X_c = X[X_start:X_end, :] into this thread's flat buffer (row i starts at i * n_features).
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+{{else}}
+        return  # no upcast buffers exist in this specialization
+{{endif}}
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,  # not read by this method
+        ITYPE_t X_end,  # not read by this method
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcast Y_c = Y[Y_start:Y_end, :] into this thread's flat buffer (row i starts at i * n_features).
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{else}}
+        return  # no upcast buffers exist in this specialization
+{{endif}}
+
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Compute `-2 * X_c @ Y_c.T` into this thread's buffer and return a pointer to it.
+        # The result is row-major with shape (X_end - X_start, Y_end - Y_start).
+        cdef:
+            const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
+            const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentations, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
+{{if need_upcast}}
+            DTYPE_t * A = self.X_c_upcast[thread_num]
+            DTYPE_t * B = self.Y_c_upcast[thread_num]
+{{else}}
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            # See: https://github.com/scipy/scipy/issues/14262
+            DTYPE_t * A = <DTYPE_t *> &X_c[0, 0]
+            DTYPE_t * B = <DTYPE_t *> &Y_c[0, 0]
+{{endif}}
+            ITYPE_t lda = X_c.shape[1]
+            ITYPE_t ldb = Y_c.shape[1]
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_c.shape[0]
+
+        # dist_middle_terms = `-2 * X_c @ Y_c.T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+
+cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
+    """{{bitness}}bit implementation of PairwiseDistancesReduction."""
+
+    cdef:
+        readonly DatasetsPair{{distance_suffix}} datasets_pair
+
+        # The number of threads that can be used is stored in effective_n_threads.
+        #
+        # The number of threads to use in the parallelisation strategy
+        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
+        # for small datasets, less threads might be needed to loop over pair of chunks.
+        #
+        # Hence the number of threads that _will_ be used for looping over chunks
+        # is stored in chunks_n_threads, allowing solely using what we need.
+        #
+        # Thus, an invariant is:
+        #
+        #                 chunks_n_threads <= effective_n_threads
+        #
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+
+        ITYPE_t n_samples_chunk, chunk_size
+
+        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
+        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
+
+        bint execute_in_parallel_on_Y
+
+    def __init__(
+        self,
+        DatasetsPair{{distance_suffix}} datasets_pair,
+        chunk_size=None,
+        strategy=None,
+     ):
+        cdef:
+            ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks
+
+        if chunk_size is None:
+            chunk_size = get_config().get("pairwise_dist_chunk_size", 256)
+
+        self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20)
+
+        self.effective_n_threads = _openmp_effective_n_threads()
+
+        self.datasets_pair = datasets_pair
+
+        self.n_samples_X = datasets_pair.n_samples_X()
+        self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size)
+        X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk
+        X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk
+        self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0)
+
+        if X_n_samples_remainder != 0:
+            self.X_n_samples_last_chunk = X_n_samples_remainder
+        else:
+            self.X_n_samples_last_chunk = self.X_n_samples_chunk
+
+        self.n_samples_Y = datasets_pair.n_samples_Y()
+        self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size)
+        Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk
+        Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk
+        self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0)
+
+        if Y_n_samples_remainder != 0:
+            self.Y_n_samples_last_chunk = Y_n_samples_remainder
+        else:
+            self.Y_n_samples_last_chunk = self.Y_n_samples_chunk
+
+        if strategy is None:
+            strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto')
+
+        if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'):
+            raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', "
+                               f"or 'auto', but currently strategy='{self.strategy}'.")
+
+        if strategy == 'auto':
+            # This is a simple heuristic whose constant for the
+            # comparison has been chosen based on experiments.
+            if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X:
                 strategy = 'parallel_on_X'
             else:
                 strategy = 'parallel_on_Y'
@@ -912,7 +1315,7 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
         #
         # For the sake of explicitness:
         #   - when parallelizing on X, the pointers of those heaps are referencing
-        #   (with proper offsets) addresses of the two main heaps (see bellow)
+        #   (with proper offsets) addresses of the two main heaps (see below)
         #   - when parallelizing on Y, the pointers of those heaps are referencing
         #   small heaps which are thread-wise-allocated and whose content will be
         #   merged with the main heaps'.
@@ -961,10 +1364,6 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
                     Y_start + j,
                 )
 
-{{if need_upcast}}
-{{else}}
-    @final
-{{endif}}
     cdef void _parallel_on_X_init_chunk(
         self,
         ITYPE_t thread_num,
@@ -1020,10 +1419,6 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
                 heaps_size * sizeof(ITYPE_t)
             )
 
-{{if need_upcast}}
-{{else}}
-    @final
-{{endif}}
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
@@ -1112,38 +1507,15 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
 
 
 cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArgKmin{{bitness}}):
-    """Fast specialized alternative for PairwiseDistancesArgKmin{{bitness}} on EuclideanDistance.
-
-    The full pairwise squared distances matrix is computed as follows:
-
-                  ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||²
-
-    The middle term gets computed efficiently bellow using BLAS Level 3 GEMM.
-
-    Notes
-    -----
-    This implementation has a superior arithmetic intensity and hence
-    better running time when the alternative is IO bound, but it can suffer
-    from numerical instability caused by catastrophic cancellation potentially
-    introduced by the subtraction in the arithmetic expression above.
-    """
-
+    """Fast specialized alternative for PairwiseDistancesArgKmin{{bitness}} on EuclideanDistance."""
     cdef:
-        const {{DTYPE_t}}[:, ::1] X
-        const {{DTYPE_t}}[:, ::1] Y
+        GEMMTermComputer{{bitness}} gemm_term_computer
+
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
-        # Buffers for GEMM
-        DTYPE_t ** dist_middle_terms_chunks
         bint use_squared_distances
 
-{{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
-        DTYPE_t ** X_c_upcast
-        DTYPE_t ** Y_c_upcast
-{{endif}}
-
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
         return (PairwiseDistancesArgKmin{{bitness}}.is_usable_for(X, Y, metric) and
@@ -1165,7 +1537,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             "Y_norm_squared" not in metric_kwargs
         ):
             warnings.warn(
-                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't"
+                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
                 f"usable for this case ({self.__class__.__name__}) and will be ignored.",
                 UserWarning,
                 stacklevel=3,
@@ -1183,45 +1555,29 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = (
             <DenseDenseDatasetsPair{{distance_suffix}}> self.datasets_pair
         )
-        self.X, self.Y = datasets_pair.X, datasets_pair.Y
+            ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
+
+        self.gemm_term_computer = GEMMTermComputer{{bitness}}(
+            datasets_pair.X,
+            datasets_pair.Y,
+            self.effective_n_threads,
+            self.chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features=datasets_pair.X.shape[1]
+        )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared")
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(self.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(datasets_pair.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms{{bitness}}(self.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{bitness}}(datasets_pair.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
-        # Temporary datastructures used in threads
-        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(
-            sizeof(DTYPE_t *) * self.chunks_n_threads
-        )
-
-{{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
-        self.X_c_upcast = <DTYPE_t **> malloc(
-            sizeof(DTYPE_t *) * self.chunks_n_threads
-        )
-        self.Y_c_upcast = <DTYPE_t **> malloc(
-            sizeof(DTYPE_t *) * self.chunks_n_threads
-        )
-{{endif}}
-
-    def __dealloc__(self):
-        if self.dist_middle_terms_chunks is not NULL:
-            free(self.dist_middle_terms_chunks)
-{{if need_upcast}}
-        if self.X_c_upcast is not NULL:
-            free(self.X_c_upcast)
-        if self.Y_c_upcast is not NULL:
-            free(self.Y_c_upcast)
-{{endif}}
-
     @final
     cdef void compute_exact_distances(self) nogil:
         if not self.use_squared_distances:
@@ -1233,22 +1589,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         ITYPE_t thread_num,
     ) nogil:
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_init(self, thread_num)
+        self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
 
-        # Temporary buffer for the `-2 * X_c @ Y_c.T` term
-        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
-            self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
-        )
-{{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
-        self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
-            self.X_n_samples_chunk * self.X.shape[1] * sizeof(DTYPE_t)
-        )
-        self.Y_c_upcast[thread_num] = <DTYPE_t *> malloc(
-            self.Y_n_samples_chunk * self.Y.shape[1] * sizeof(DTYPE_t)
-        )
-{{endif}}
 
-{{if need_upcast}}
     @final
     cdef void _parallel_on_X_init_chunk(
         self,
@@ -1256,16 +1599,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
-        cdef:
-            ITYPE_t n_features = self.X.shape[1]
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(n_features):
-                self.X_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.X[X_start + i, j]
 
     @final
     cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
@@ -1276,34 +1612,16 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         ITYPE_t Y_end,
         ITYPE_t thread_num,
     ) nogil:
-        cdef:
-            ITYPE_t n_features = self.Y.shape[1]
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
             self,
             X_start, X_end,
             Y_start, Y_end,
             thread_num,
         )
+        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num,
+        )
 
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(n_features):
-                self.Y_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-{{endif}}
-
-    @final
-    cdef void _parallel_on_X_parallel_finalize(
-        self,
-        ITYPE_t thread_num
-    ) nogil:
-        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_finalize(self, thread_num)
-        free(self.dist_middle_terms_chunks[thread_num])
-{{if need_upcast}}
-        free(self.X_c_upcast[thread_num])
-        free(self.Y_c_upcast[thread_num])
-{{endif}}
 
     @final
     cdef void _parallel_on_Y_init(
@@ -1311,23 +1629,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
     ) nogil:
         cdef ITYPE_t thread_num
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_init(self)
+        self.gemm_term_computer._parallel_on_Y_init()
 
-        for thread_num in range(self.chunks_n_threads):
-            # Temporary buffer for the `-2 * X_c @ Y_c.T` term
-            self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
-                self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t)
-            )
-{{if need_upcast}}
-            # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
-            self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
-                self.X_n_samples_chunk * self.X.shape[1] * sizeof(DTYPE_t)
-            )
-            self.Y_c_upcast[thread_num] = <DTYPE_t *> malloc(
-                self.Y_n_samples_chunk * self.Y.shape[1] * sizeof(DTYPE_t)
-            )
-{{endif}}
 
-{{if need_upcast}}
     @final
     cdef void _parallel_on_Y_parallel_init(
         self,
@@ -1335,16 +1639,9 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         ITYPE_t X_start,
         ITYPE_t X_end,
     ) nogil:
-        cdef:
-            ITYPE_t n_features = self.X.shape[1]
-            ITYPE_t n_chunk_samples = X_end - X_start
-            ITYPE_t i, j
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(n_features):
-                self.X_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.X[X_start + i, j]
 
     @final
     cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
@@ -1355,37 +1652,16 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
         ITYPE_t Y_end,
         ITYPE_t thread_num,
     ) nogil:
-        cdef:
-            ITYPE_t n_features = self.Y.shape[1]
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
         PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
             self,
             X_start, X_end,
             Y_start, Y_end,
             thread_num,
         )
+        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num
+        )
 
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(n_features):
-                self.Y_c_upcast[thread_num][i * n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-{{endif}}
-
-
-    @final
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_finalize(self)
-
-        for thread_num in range(self.chunks_n_threads):
-            free(self.dist_middle_terms_chunks[thread_num])
-{{if need_upcast}}
-            free(self.X_c_upcast[thread_num])
-            free(self.Y_c_upcast[thread_num])
-{{endif}}
 
     @final
     cdef void _compute_and_reduce_distances_on_chunks(
@@ -1398,48 +1674,19 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
     ) nogil:
         cdef:
             ITYPE_t i, j
-
-            const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
-            const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
-            DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
-            ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_c.shape[0]
-            ITYPE_t n = Y_c.shape[0]
-            ITYPE_t K = X_c.shape[1]
-            DTYPE_t alpha = - 2.
-            ITYPE_t lda = X_c.shape[1]
-            ITYPE_t ldb = X_c.shape[1]
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_c.shape[0]
-{{if need_upcast}}
-            # Those two buffers have been upcast from 32bit to 64bit previously.
-            DTYPE_t * A = self.X_c_upcast[thread_num]
-            DTYPE_t * B = self.Y_c_upcast[thread_num]
-{{else}}
-            # Casting for A and B to remove the const is needed because APIs exposed via
-            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
-            # See: https://github.com/scipy/scipy/issues/14262
-            DTYPE_t * A = <{{DTYPE_t}} *> &X_c[0, 0]
-            DTYPE_t * B = <{{DTYPE_t}} *> &Y_c[0, 0]
-{{endif}}
-
-        # dist_middle_terms = `-2 * X_c @ Y_c.T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+            DTYPE_t squared_dist_i_j
+            ITYPE_t n_X = X_end - X_start
+            ITYPE_t n_Y = Y_end - Y_start
+            DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks(
+                X_start, X_end, Y_start, Y_end, thread_num
+            )
+            DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
+            ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num]
 
         # Pushing the distance and their associated indices on heaps
         # which keep tracks of the argkmin.
-        for i in range(X_c.shape[0]):
-            for j in range(Y_c.shape[0]):
+        for i in range(n_X):
+            for j in range(n_Y):
                 heap_push(
                     heaps_r_distances + i * self.k,
                     heaps_indices + i * self.k,
@@ -1450,10 +1697,618 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
                     #
                     (
                         self.X_norm_squared[i + X_start] +
-                        dist_middle_terms[i * Y_c.shape[0] + j] +
+                        dist_middle_terms[i * n_Y + j] +
                         self.Y_norm_squared[j + Y_start]
                     ),
                     j + Y_start,
                 )
 
+
+cdef class PairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseDistancesReduction{{bitness}}):
+    """Compute radius-based neighbors for two sets of vectors.
+
+    For each row-vector X[i] of the queries X, find all the indices j of
+    row-vectors in Y such that:
+
+                        dist(X[i], Y[j]) <= radius
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    Parameters
+    ----------
+    datasets_pair: DatasetsPair
+        The dataset pair (X, Y) for the reduction.
+
+    chunk_size: int, default=None
+        The number of vectors per chunk. If None (default) looks-up in
+        scikit-learn configuration for `pairwise_dist_chunk_size`,
+        and use 256 if it is not set.
+
+    radius: float
+        The radius defining the neighborhood.
+    """
+
+    cdef:
+        DTYPE_t radius
+
+        # DistanceMetric computes rank-preserving surrogate distances via rdist,
+        # which are proxies requiring fewer computations.
+        # We get the equivalent for the radius to be able to compare it against
+        # vectors' rank-preserving surrogate distances.
+        DTYPE_t r_radius
+
+        # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays.
+        #
+        # For this implementation, we want resizable buffers which we will wrap
+        # into numpy arrays at the end. std::vector comes as a handy interface
+        # for interacting efficiently with resizable buffers.
+        #
+        # Though it is possible to access their buffer address with
+        # std::vector::data, they can't be stolen: buffers lifetime
+        # is tied to their std::vector and are deallocated when
+        # std::vectors are.
+        #
+        # To solve this, we dynamically allocate std::vectors and then
+        # encapsulate them in a StdVectorSentinel responsible for
+        # freeing them when the associated np.ndarray is freed.
+        #
+        # Shared pointers (defined via shared_ptr) are used for safer memory management.
+        # Unique pointers (defined via unique_ptr) can't be used as datastructures
+        # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk.
+        shared_ptr[vector[vector[ITYPE_t]]] neigh_indices
+        shared_ptr[vector[vector[DTYPE_t]]] neigh_distances
+
+        # Used as array of pointers to private datastructures used in threads.
+        vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks
+        vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks
+
+        bint sort_results
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        DTYPE_t radius,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+        bint return_distance=False,
+        bint sort_results=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        radius : float
+            The radius defining the neighborhood.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset parallelization is made on.
+
+            For both strategies the computations happen with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+              - None (default) looks-up in scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its neighbors if set to True.
+
+        sort_results : boolean, default=False
+            Sort results with respect to distances between each X vector and its
+            neighbors if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+
+        If return_distance=True:
+          - neighbors_distances : ndarray of n_samples_X ndarray
+            Distances to the neighbors for each vector in X.
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the arguments
+        values to dispatch to the private
+        :meth:`PairwiseDistancesRadiusNeighborhood._compute` instance method of
+        the most appropriate :class:`PairwiseDistancesRadiusNeighborhood`
+        concrete implementation.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the interface entirely from the
+        implementation details whilst maintaining RAII.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory comes to handle specialisations for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various backend and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+        if (
+            metric in ("euclidean", "sqeuclidean")
+            and not issparse(X)
+            and not issparse(Y)
+        ):
+            # Specialized implementation with improved arithmetic intensity
+            # and vector instructions (SIMD) by processing several vectors
+            # at a time to leverage a call to the BLAS GEMM routine as explained
+            # in more detail in the docstring.
+            use_squared_distances = metric == "sqeuclidean"
+            pda = FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}(
+                X=X, Y=Y, radius=radius,
+                use_squared_distances=use_squared_distances,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+            )
+        else:
+            # Fall back on a generic implementation that handles most scipy
+            # metrics by computing the distances between 2 vectors at a time.
+            pda = PairwiseDistancesRadiusNeighborhood{{bitness}}(
+                datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs),
+                radius=radius,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+            )
+
+        # Limit the number of threads in second level of nested parallelism for BLAS
+        # to avoid threads over-subscription (in GEMM for instance).
+        with threadpool_limits(limits=1, user_api="blas"):
+            if pda.execute_in_parallel_on_Y:
+                pda._parallel_on_Y()
+            else:
+                pda._parallel_on_X()
+
+        return pda._finalize_results(return_distance)
+
+
+    def __init__(
+        self,
+        DatasetsPair{{distance_suffix}} datasets_pair,
+        DTYPE_t radius,
+        chunk_size=None,
+        strategy=None,
+        sort_results=False,
+        metric_kwargs=None,
+    ):
+        super().__init__(
+            datasets_pair=datasets_pair,
+            chunk_size=chunk_size,
+            strategy=strategy,
+        )
+
+        self.radius = check_scalar(radius, "radius", Real, min_val=0)
+        self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
+        self.sort_results = sort_results
+
+        # Allocating pointers to datastructures but not the datastructures themselves.
+        # There are as many pointers as effective threads.
+        #
+        # For the sake of explicitness:
+        #   - when parallelizing on X, those pointers are referencing
+        #   self.neigh_distances and self.neigh_indices
+        #   - when parallelizing on Y, those pointers are referencing
+        #   std::vectors of std::vectors which are thread-wise-allocated and whose
+        #   content will be merged into self.neigh_distances and self.neigh_indices.
+        self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]](
+            self.chunks_n_threads
+        )
+        self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]](
+            self.chunks_n_threads
+        )
+
+        # Temporary datastructures which will be coerced to numpy arrays just before
+        # PairwiseDistancesRadiusNeighborhood.compute returns and will then be freed.
+        self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X)
+        self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X)
+
+    cdef void _compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            DTYPE_t r_dist_i_j
+
+        for i in range(X_start, X_end):
+            for j in range(Y_start, Y_end):
+                r_dist_i_j = self.datasets_pair.surrogate_dist(i, j)
+                if r_dist_i_j <= self.r_radius:
+                    deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j)
+                    deref(self.neigh_indices_chunks[thread_num])[i].push_back(j)
+
+    def _finalize_results(self, bint return_distance=False):
+        if return_distance:
+            # We need to recompute distances because we relied on
+            # surrogate distances for the reduction.
+            self.compute_exact_distances()
+            return (
+                coerce_vectors_to_nd_arrays(self.neigh_distances),
+                coerce_vectors_to_nd_arrays(self.neigh_indices),
+            )
+
+        return coerce_vectors_to_nd_arrays(self.neigh_indices)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+
+        # As this strategy is embarrassingly parallel, we can set the
+        # thread vectors' pointers to the main vectors'.
+        self.neigh_distances_chunks[thread_num] = self.neigh_distances
+        self.neigh_indices_chunks[thread_num] = self.neigh_indices
+
+    @final
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx
+
+        # Sorting neighbors for each query vector of X
+        if self.sort_results:
+            for idx in range(X_start, X_end):
+                simultaneous_sort(
+                    deref(self.neigh_distances)[idx].data(),
+                    deref(self.neigh_indices)[idx].data(),
+                    deref(self.neigh_indices)[idx].size()
+                )
+
+    cdef void _parallel_on_Y_init(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t thread_num
+        # As chunks of X are shared across threads, so must datastructures to avoid race
+        # conditions: each thread has its own vectors of n_samples_X vectors which are
+        # then merged back in the main n_samples_X vectors.
+        for thread_num in range(self.chunks_n_threads):
+            self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X)
+            self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X)
+
+    @final
+    cdef void _merge_vectors(
+        self,
+        ITYPE_t idx,
+        ITYPE_t num_threads,
+    ) nogil:
+        cdef:
+            ITYPE_t thread_num
+            ITYPE_t idx_n_elements = 0
+            ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size()
+
+        # Resizing buffers only once for the given number of elements.
+        for thread_num in range(num_threads):
+            idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size()
+
+        deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements)
+        deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements)
+
+        # Moving the elements by range using the range first element
+        # as the reference for the insertion.
+        for thread_num in range(num_threads):
+            move(
+                deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
+                deref(self.neigh_distances_chunks[thread_num])[idx].end(),
+                deref(self.neigh_distances)[idx].begin() + last_element_idx
+            )
+            move(
+                deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
+                deref(self.neigh_indices_chunks[thread_num])[idx].end(),
+                deref(self.neigh_indices)[idx].begin() + last_element_idx
+            )
+            last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
+
+
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
+
+        with nogil, parallel(num_threads=self.effective_n_threads):
+            # Merge vectors used in threads into the main ones.
+            # This is done in parallel sample-wise (no need for locks)
+            # using dynamic scheduling because we might not have
+            # the same number of neighbors for each query vector.
+            # TODO: compare 'dynamic' vs 'static' vs 'guided'
+            for idx in prange(self.n_samples_X, schedule='dynamic'):
+                self._merge_vectors(idx, self.chunks_n_threads)
+
+            # The contents of the vectors have been std::moved.
+            # Hence they can't be used anymore and can be deleted.
+            # Their deletion is carried out automatically as the
+            # implementation relies on shared pointers.
+
+            # Sort in parallel in ascending order w.r.t the distances if requested.
+            if self.sort_results:
+                for idx in prange(self.n_samples_X, schedule='static'):
+                    simultaneous_sort(
+                        deref(self.neigh_distances)[idx].data(),
+                        deref(self.neigh_indices)[idx].data(),
+                        deref(self.neigh_indices)[idx].size()
+                    )
+
+        return
+
+    cdef void compute_exact_distances(self) nogil:
+        """Convert rank-preserving distances to pairwise distances in parallel."""
+        cdef:
+            ITYPE_t i, j
+
+        for i in prange(self.n_samples_X, nogil=True, schedule='dynamic',
+                        num_threads=self.effective_n_threads):
+            for j in range(deref(self.neigh_indices)[i].size()):
+                deref(self.neigh_distances)[i][j] = (
+                        self.datasets_pair.distance_metric._rdist_to_dist(
+                            # Guard against eventual -0., causing nan production.
+                            max(deref(self.neigh_distances)[i][j], 0.)
+                        )
+                )
+
+
+cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseDistancesRadiusNeighborhood{{bitness}}):
+    """Fast specialized variant for PairwiseDistancesRadiusNeighborhood on EuclideanDistance.
+
+    The full pairwise squared distances matrix is computed as follows:
+
+                  ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||²
+
+    The middle term gets computed efficiently below using BLAS Level 3 GEMM.
+
+    Notes
+    -----
+    This implementation has a superior arithmetic intensity and hence
+    better running time when the variant is IO bound, but it can suffer
+    from numerical instability caused by catastrophic cancellation potentially
+    introduced by the subtraction in the arithmetic expression above.
+    This matters when numerical precision is needed.
+    """
+
+    cdef:
+        GEMMTermComputer{{bitness}} gemm_term_computer
+        const DTYPE_t[::1] X_norm_squared
+        const DTYPE_t[::1] Y_norm_squared
+
+        bint use_squared_distances
+
+    @classmethod
+    def is_usable_for(cls, X, Y, metric) -> bool:
+        return (PairwiseDistancesRadiusNeighborhood{{bitness}}.is_usable_for(X, Y, metric)
+                and not _in_unstable_openblas_configuration())
+
+    def __init__(
+        self,
+        X,
+        Y,
+        DTYPE_t radius,
+        bint use_squared_distances=False,
+        chunk_size=None,
+        strategy=None,
+        sort_results=False,
+        metric_kwargs=None,
+    ):
+        if (
+            metric_kwargs is not None and
+            len(metric_kwargs) > 0 and
+            "Y_norm_squared" not in metric_kwargs
+        ):
+            warnings.warn(
+                f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "
+                f"usable for this case ({self.__class__.__name__}) and will be ignored.",
+                UserWarning,
+                stacklevel=3,
+            )
+
+        super().__init__(
+            # The datasets pair here is used for exact distances computations
+            datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric="euclidean"),
+            radius=radius,
+            chunk_size=chunk_size,
+            strategy=strategy,
+            sort_results=sort_results,
+            metric_kwargs=metric_kwargs,
+        )
+        # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair
+        cdef:
+            DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = <DenseDenseDatasetsPair{{distance_suffix}}> self.datasets_pair
+            ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
+
+        self.gemm_term_computer = GEMMTermComputer{{bitness}}(
+            datasets_pair.X,
+            datasets_pair.Y,
+            self.effective_n_threads,
+            self.chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features=datasets_pair.X.shape[1]
+        )
+
+        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
+            self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared")
+        else:
+            self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(datasets_pair.Y, self.effective_n_threads)
+
+        # Do not recompute norms if datasets are identical.
+        self.X_norm_squared = (
+            self.Y_norm_squared if X is Y else
+            _sqeuclidean_row_norms{{bitness}}(datasets_pair.X, self.effective_n_threads)
+        )
+        self.use_squared_distances = use_squared_distances
+
+        if use_squared_distances:
+            # In this specialisation and this setup, the value passed to the radius is
+            # already considered to be the adapted radius, so we overwrite it.
+            self.r_radius = radius
+
+    @final
+    cdef void _parallel_on_X_parallel_init(
+        self,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_parallel_init(self, thread_num)
+        self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
+
+    @final
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+
+    @final
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num,
+        )
+
+    @final
+    cdef void _parallel_on_Y_init(
+        self,
+    ) nogil:
+        cdef ITYPE_t thread_num
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_init(self)
+        self.gemm_term_computer._parallel_on_Y_init()
+
+    @final
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+
+    @final
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num
+        )
+
+    @final
+    cdef void compute_exact_distances(self) nogil:
+        if not self.use_squared_distances:
+            PairwiseDistancesRadiusNeighborhood{{bitness}}.compute_exact_distances(self)
+
+    @final
+    cdef void _compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            DTYPE_t squared_dist_i_j
+            ITYPE_t n_X = X_end - X_start
+            ITYPE_t n_Y = Y_end - Y_start
+            DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks(
+                X_start, X_end, Y_start, Y_end, thread_num
+            )
+
+        # Pushing the distance and their associated indices in vectors.
+        for i in range(n_X):
+            for j in range(n_Y):
+                # Using the squared euclidean distance as the rank-preserving distance:
+                #
+                #             ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                #
+                squared_dist_i_j = (
+                    self.X_norm_squared[i + X_start]
+                    + dist_middle_terms[i * n_Y + j]
+                    + self.Y_norm_squared[j + Y_start]
+                )
+                if squared_dist_i_j <= self.r_radius:
+                    deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j)
+                    deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)
 {{endfor}}
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 1e58d2e2609bb..48df97f31fef4 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -8,6 +8,7 @@
 from sklearn.metrics._pairwise_distances_reduction import (
     PairwiseDistancesReduction,
     PairwiseDistancesArgKmin,
+    PairwiseDistancesRadiusNeighborhood,
     _sqeuclidean_row_norms64,
     _sqeuclidean_row_norms32,
 )
@@ -31,11 +32,11 @@
 ]
 
 
-def _get_dummy_metric_params_list(metric: str, n_features: int):
+def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
     """Return list of dummy DistanceMetric kwargs for tests."""
 
     # Distinguishing on cases not to compute unneeded datastructures.
-    rng = np.random.RandomState(1)
+    rng = np.random.RandomState(seed)
 
     if metric == "minkowski":
         minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
@@ -135,13 +136,37 @@ def assert_argkmin_results_quasi_equality(
             skip_permutation_check = False
 
 
+def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices):
+    # We get arrays of arrays and we need to check for individual pairs
+    for i in range(ref_dist.shape[0]):
+        assert_array_equal(
+            ref_indices[i],
+            indices[i],
+            err_msg=f"Query vector #{i} has different neighbors' indices",
+        )
+        assert_allclose(
+            ref_dist[i],
+            dist[i],
+            err_msg=f"Query vector #{i} has different neighbors' distances",
+            rtol=1e-7,
+        )
+
+
 ASSERT_RESULT = {
     # In the case of 64bit, we test for exact equality.
     (PairwiseDistancesArgKmin, np.float64): assert_argkmin_results_equality,
+    (
+        PairwiseDistancesRadiusNeighborhood,
+        np.float64,
+    ): assert_radius_neighborhood_results_equality,
     # In the case of 32bit, indices can be permuted due to small difference
     # in the computations of their associated distances, hence we test equality of
     # results up to valid permutations.
     (PairwiseDistancesArgKmin, np.float32): assert_argkmin_results_quasi_equality,
+    (
+        PairwiseDistancesRadiusNeighborhood,
+        np.float32,
+    ): assert_radius_neighborhood_results_equality,
 }
 
 
@@ -178,8 +203,8 @@ def test_argkmin_factory_method_wrong_usages():
     metric = "euclidean"
 
     msg = (
-        "No implementation exist for fused-typed datasets pair. "
-        "Currently X.dtype=float32 and Y.dtype=float64."
+        "Datasets must both be of np.float64 or np.float32 dtype. Currently: "
+        "X.dtype=float32 and Y.dtype=float64."
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(
@@ -187,8 +212,8 @@ def test_argkmin_factory_method_wrong_usages():
         )
 
     msg = (
-        "No implementation exist for fused-typed datasets pair. "
-        "Currently X.dtype=float64 and Y.dtype=int32."
+        "Datasets must both be of np.float64 or np.float32 dtype. Currently: "
+        "X.dtype=float64 and Y.dtype=int32"
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)
@@ -214,25 +239,99 @@ def test_argkmin_factory_method_wrong_usages():
             X=np.asfortranarray(X), Y=Y, k=k, metric=metric
         )
 
+    unused_metric_kwargs = {"p": 3}
+
+    message = (
+        r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this"
+        r" case \("
+        r"FastEuclideanPairwiseDistancesArgKmin."
+    )
+
+    with pytest.warns(UserWarning, match=message):
+        PairwiseDistancesArgKmin.compute(
+            X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
+        )
+
+
+def test_radius_neighborhood_factory_method_wrong_usages():
+    rng = np.random.RandomState(1)
+    X = rng.rand(100, 10)
+    Y = rng.rand(100, 10)
+    radius = 5
+    metric = "euclidean"
+
+    with pytest.raises(
+        ValueError,
+        match=(
+            "Datasets must both be of np.float64 or np.float32 dtype. "
+            "Currently: X.dtype=float32 and Y.dtype=float64"
+        ),
+    ):
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=X.astype(np.float32), Y=Y, radius=radius, metric=metric
+        )
+
+    with pytest.raises(
+        ValueError,
+        match=(
+            "Datasets must both be of np.float64 or np.float32 dtype. "
+            "Currently: X.dtype=float64 and Y.dtype=int32"
+        ),
+    ):
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=X, Y=Y.astype(np.int32), radius=radius, metric=metric
+        )
+
+    with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
+        PairwiseDistancesRadiusNeighborhood.compute(X=X, Y=Y, radius=-1, metric=metric)
+
+    with pytest.raises(ValueError, match="Unrecognized metric"):
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=X, Y=Y, radius=radius, metric="wrong metric"
+        )
+
+    with pytest.raises(
+        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
+    ):
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
+        )
+
+    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
+        )
+
+    unused_metric_kwargs = {"p": 3}
+
+    message = (
+        r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this"
+        r" case \(FastEuclideanPairwiseDistancesRadiusNeighborhood"
+    )
+
+    with pytest.warns(UserWarning, match=message):
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
+        )
+
 
-@pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("chunk_size", [50, 512, 1024])
 @pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
-    [PairwiseDistancesArgKmin],
+    [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
 )
 def test_chunk_size_agnosticism(
+    global_random_seed,
     PairwiseDistancesReduction,
-    seed,
     n_samples,
     chunk_size,
     dtype,
     n_features=100,
 ):
     # Results should not depend on the chunk size
-    rng = np.random.RandomState(seed)
+    rng = np.random.RandomState(global_random_seed)
     spread = 100
     X = rng.rand(n_samples, n_features).astype(dtype) * spread
     Y = rng.rand(n_samples, n_features).astype(dtype) * spread
@@ -248,6 +347,7 @@ def test_chunk_size_agnosticism(
         X,
         Y,
         parameter,
+        metric="manhattan",
         return_distance=True,
     )
 
@@ -256,30 +356,30 @@ def test_chunk_size_agnosticism(
         Y,
         parameter,
         chunk_size=chunk_size,
+        metric="manhattan",
         return_distance=True,
     )
 
-    ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)](
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
         ref_dist, dist, ref_indices, indices
     )
 
 
-@pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
-    [PairwiseDistancesArgKmin],
+    [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
 )
 def test_n_threads_agnosticism(
+    global_random_seed,
+    dtype,
     PairwiseDistancesReduction,
-    seed,
     n_samples,
-    chunk_size,
-    dtype,
     n_features=100,
 ):
     # Results should not depend on the number of threads
-    rng = np.random.RandomState(seed)
+    rng = np.random.RandomState(global_random_seed)
     spread = 100
     X = rng.rand(n_samples, n_features).astype(dtype) * spread
     Y = rng.rand(n_samples, n_features).astype(dtype) * spread
@@ -303,7 +403,7 @@ def test_n_threads_agnosticism(
             X, Y, parameter, return_distance=True
         )
 
-    ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)](
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
         ref_dist, dist, ref_indices, indices
     )
 
@@ -349,31 +449,30 @@ def test_dtype_agnosticism(
     # We check results against np.float32 because we inherently
     # loose the information from np.float64.
     dist = dist.astype(ref_dist.dtype)
-    ASSERT_RESULT[(PairwiseDistancesArgKmin, np.float32)](
+    ASSERT_RESULT[(PairwiseDistancesReduction, np.float32)](
         ref_dist, dist, ref_indices, indices
     )
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
-@pytest.mark.parametrize("seed", range(5))
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics())
 @pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
-    [PairwiseDistancesArgKmin],
+    [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
 )
 def test_strategies_consistency(
+    global_random_seed,
     PairwiseDistancesReduction,
     metric,
     n_samples,
-    seed,
     dtype,
     n_features=10,
 ):
 
-    rng = np.random.RandomState(seed)
+    rng = np.random.RandomState(global_random_seed)
     spread = 100
     X = rng.rand(n_samples, n_features).astype(dtype) * spread
     Y = rng.rand(n_samples, n_features).astype(dtype) * spread
@@ -396,7 +495,9 @@ def test_strategies_consistency(
         parameter,
         metric=metric,
         # Taking the first
-        metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0],
+        metric_kwargs=_get_metric_params_list(
+            metric, n_features, seed=global_random_seed
+        )[0],
         # To be sure to use parallelization
         chunk_size=n_samples // 4,
         strategy="parallel_on_X",
@@ -409,7 +510,9 @@ def test_strategies_consistency(
         parameter,
         metric=metric,
         # Taking the first
-        metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0],
+        metric_kwargs=_get_metric_params_list(
+            metric, n_features, seed=global_random_seed
+        )[0],
         # To be sure to use parallelization
         chunk_size=n_samples // 4,
         strategy="parallel_on_Y",
@@ -424,7 +527,7 @@ def test_strategies_consistency(
     )
 
 
-# Concrete PairwiseDistancesReductions tests
+# "Concrete PairwiseDistancesReductions"-specific tests
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@@ -434,6 +537,7 @@ def test_strategies_consistency(
 @pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
 def test_pairwise_distances_argkmin(
+    global_random_seed,
     n_features,
     translation,
     metric,
@@ -442,7 +546,7 @@ def test_pairwise_distances_argkmin(
     n_samples=100,
     k=10,
 ):
-    rng = np.random.RandomState(0)
+    rng = np.random.RandomState(global_random_seed)
     spread = 1000
     X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
@@ -452,7 +556,7 @@ def test_pairwise_distances_argkmin(
         X = np.ascontiguousarray(X[:, :2])
         Y = np.ascontiguousarray(Y[:, :2])
 
-    metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0]
+    metric_kwargs = _get_metric_params_list(metric, n_features)[0]
 
     # Reference for argkmin results
     if metric == "euclidean":
@@ -489,17 +593,83 @@ def test_pairwise_distances_argkmin(
     )
 
 
-@pytest.mark.parametrize("seed", range(10))
+# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
+@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
+@pytest.mark.parametrize("n_features", [50, 500])
+@pytest.mark.parametrize("translation", [0, 1e6])
+@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
+@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
+def test_pairwise_distances_radius_neighbors(
+    global_random_seed,
+    n_features,
+    translation,
+    metric,
+    strategy,
+    n_samples=100,
+    dtype=np.float64,
+):
+    rng = np.random.RandomState(global_random_seed)
+    spread = 1000
+    radius = spread * np.log(n_features)
+    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
+    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
+
+    metric_kwargs = _get_metric_params_list(
+        metric, n_features, seed=global_random_seed
+    )[0]
+
+    # Reference for radius neighbors results
+    if metric == "euclidean":
+        # Compare to scikit-learn GEMM optimized implementation
+        dist_matrix = euclidean_distances(X, Y)
+    else:
+        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
+
+    # Getting the neighbors for a given radius
+    neigh_indices_ref = []
+    neigh_distances_ref = []
+
+    for row in dist_matrix:
+        ind = np.arange(row.shape[0])[row <= radius]
+        dist = row[ind]
+
+        sort = np.argsort(dist)
+        ind, dist = ind[sort], dist[sort]
+
+        neigh_indices_ref.append(ind)
+        neigh_distances_ref.append(dist)
+
+    neigh_indices_ref = np.array(neigh_indices_ref)
+    neigh_distances_ref = np.array(neigh_distances_ref)
+
+    neigh_distances, neigh_indices = PairwiseDistancesRadiusNeighborhood.compute(
+        X,
+        Y,
+        radius,
+        metric=metric,
+        metric_kwargs=metric_kwargs,
+        return_distance=True,
+        # So as to have more than a chunk, forcing parallelism.
+        chunk_size=n_samples // 4,
+        strategy=strategy,
+        sort_results=True,
+    )
+
+    ASSERT_RESULT[PairwiseDistancesRadiusNeighborhood](
+        neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref
+    )
+
+
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("n_features", [5, 10, 100])
 @pytest.mark.parametrize("num_threads", [1, 2, 8])
 def test_sqeuclidean_row_norms(
-    seed,
+    global_random_seed,
     n_samples,
     n_features,
     num_threads,
 ):
-    rng = np.random.RandomState(seed)
+    rng = np.random.RandomState(global_random_seed)
     spread = 100
     X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
     X32 = X64.astype(np.float32)

From 26b3839a5bab615793f2036c2f4bf734e0b5f7f5 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 14 Apr 2022 13:55:11 +0200
Subject: [PATCH 22/26] MAINT Correctly allocate buffer for upcasting

---
 .../_pairwise_distances_reduction.pyx.tp      | 26 +++++++------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index e93d7a2208be1..9bdc3155d9ce7 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -46,6 +46,7 @@ import warnings
 
 from .. import get_config
 from libc.stdlib cimport free, malloc
+from libc.stdio cimport printf
 from libc.float cimport DBL_MAX
 from libcpp.memory cimport shared_ptr, make_shared
 from libcpp.vector cimport vector
@@ -627,8 +628,8 @@ cdef class GEMMTermComputer{{bitness}}:
         vector[vector[DTYPE_t]] dist_middle_terms_chunks
 
 {{if need_upcast}}
-        DTYPE_t** X_c_upcast
-        DTYPE_t** Y_c_upcast
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
 {{endif}}
 
     def __init__(self,
@@ -649,20 +650,13 @@ cdef class GEMMTermComputer{{bitness}}:
         self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
 
 {{if need_upcast}}
+        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+
         # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
         for thread_num in range(self.effective_n_threads):
-            self.X_c_upcast[thread_num] = <DTYPE_t *> malloc(
-                self.dist_middle_terms_chunks_size * sizeof(DTYPE_t)
-            )
-            self.Y_c_upcast[thread_num] = <DTYPE_t *> malloc(
-                self.dist_middle_terms_chunks_size * sizeof(DTYPE_t)
-            )
-
-    def __dealloc_(self):
-        if self.X_c_upcast is not NULL:
-            free(self.X_c_upcast)
-        if self.Y_c_upcast is not NULL:
-            free(self.Y_c_upcast)
+            self.X_c_upcast[thread_num].resize(self.dist_middle_terms_chunks_size)
+            self.Y_c_upcast[thread_num].resize(self.dist_middle_terms_chunks_size)
 {{endif}}
 
 
@@ -783,8 +777,8 @@ cdef class GEMMTermComputer{{bitness}}:
             ITYPE_t K = X_c.shape[1]
             DTYPE_t alpha = - 2.
 {{if need_upcast}}
-            DTYPE_t * A = self.X_c_upcast[thread_num]
-            DTYPE_t * B = self.Y_c_upcast[thread_num]
+            DTYPE_t * A = self.X_c_upcast[thread_num].data()
+            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
 {{else}}
             # Casting for A and B to remove the const is needed because APIs exposed via
             # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.

From 7b0bcd3664f3960f882a2ef1d196ff2f8fb18eca Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Sun, 29 May 2022 10:17:33 +0200
Subject: [PATCH 23/26] TST Update tests

---
 .../test_pairwise_distances_reduction.py      | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 48df97f31fef4..7bfd89dc06800 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -100,10 +100,14 @@ def assert_argkmin_results_quasi_equality(
     for i in range(n - 1):
         # We test the equality of pair of adjacent indices and distances
         # of the references against the results.
-        rd_current, rd_next = ref_dist[i], ref_dist[i + 1]
-        d_current, d_next = dist[i], dist[i + 1]
-        ri_current, ri_next = ref_indices[i], ref_indices[i + 1]
-        i_current, i_next = indices[i], indices[i + 1]
+        rd_prev, rd_current, rd_next = ref_dist[i - 1], ref_dist[i], ref_dist[i + 1]
+        d_prev, d_current, d_next = dist[i - 1], dist[i], dist[i + 1]
+        ri_prev, ri_current, ri_next = (
+            ref_indices[i - 1],
+            ref_indices[i],
+            ref_indices[i + 1],
+        )
+        i_prev, i_current, i_next = indices[i - 1], indices[i], indices[i + 1]
 
         assert np.isclose(
             d_current, rd_current, rtol=rtol
@@ -121,10 +125,10 @@ def assert_argkmin_results_quasi_equality(
             )
             assert skip_permutation_check or valid_permutation, (
                 "Query vectors have different neighbors' indices \n"
-                f"(i_current, i_next) = {i_current, i_next} \n"
-                f"(ri_current, ri_next) = {ri_current, ri_next} \n"
-                f"(d_current, d_next) = {d_current, d_next} \n"
-                f"(rd_current, rd_next) = {rd_current, rd_next} \n"
+                f"(i_prev, i_current, i_next) = {i_prev, i_current, i_next} \n"
+                f"(ri_prev, ri_current, ri_next) = {ri_prev, ri_current, ri_next} \n"
+                f"(d_prev, d_current, d_next) = {d_prev, d_current, d_next} \n"
+                f"(rd_prev, rd_current, rd_next) = {rd_prev, rd_current, rd_next} \n"
             )
             # If there's a permutation at this iteration, we need to
             # skip the following permutation check.
@@ -565,7 +569,7 @@ def test_pairwise_distances_argkmin(
     else:
         dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
     # Taking argkmin (indices of the k smallest values)
-    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
+    argkmin_indices_ref = np.argsort(dist_matrix, kind="mergesort", axis=1)[:, :k]
     # Getting the associated distances
     argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
     for row_idx in range(argkmin_indices_ref.shape[0]):
@@ -655,7 +659,7 @@ def test_pairwise_distances_radius_neighbors(
         sort_results=True,
     )
 
-    ASSERT_RESULT[PairwiseDistancesRadiusNeighborhood](
+    ASSERT_RESULT[(PairwiseDistancesRadiusNeighborhood, dtype)](
         neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref
     )
 

From f0fc839adcc10f8640a487ed84526578a1e1d24c Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Sun, 29 May 2022 10:24:56 +0200
Subject: [PATCH 24/26] MAINT Correctly resize buffers for upcasting

---
 .../_pairwise_distances_reduction.pyx.tp       | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index 90ea78c11305b..ce2dec47d10d3 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -622,11 +622,13 @@ cdef class GEMMTermComputer{{bitness}}:
         ITYPE_t chunks_n_threads
         ITYPE_t dist_middle_terms_chunks_size
         ITYPE_t n_features
+        ITYPE_t chunk_size
 
         # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
         vector[vector[DTYPE_t]] dist_middle_terms_chunks
 
 {{if need_upcast}}
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
         vector[vector[DTYPE_t]] X_c_upcast
         vector[vector[DTYPE_t]] Y_c_upcast
 {{endif}}
@@ -638,6 +640,7 @@ cdef class GEMMTermComputer{{bitness}}:
         ITYPE_t chunks_n_threads,
         ITYPE_t dist_middle_terms_chunks_size,
         ITYPE_t n_features,
+        ITYPE_t chunk_size,
     ):
         self.X = X
         self.Y = Y
@@ -645,17 +648,20 @@ cdef class GEMMTermComputer{{bitness}}:
         self.chunks_n_threads = chunks_n_threads
         self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
         self.n_features = n_features
+        self.chunk_size = chunk_size
 
         self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
 
 {{if need_upcast}}
+        # We populate the buffer for upcasting chunks of X and Y from 32bit to 64bit.
         self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
         self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
 
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit.
+        upcast_buffer_n_elements = self.chunk_size * n_features
+
         for thread_num in range(self.effective_n_threads):
-            self.X_c_upcast[thread_num].resize(self.dist_middle_terms_chunks_size)
-            self.Y_c_upcast[thread_num].resize(self.dist_middle_terms_chunks_size)
+            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
 {{endif}}
 
 
@@ -1556,7 +1562,8 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1]
+            n_features=datasets_pair.X.shape[1],
+            chunk_size=self.chunk_size,
         )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
@@ -2171,7 +2178,8 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseD
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
-            n_features=datasets_pair.X.shape[1]
+            n_features=datasets_pair.X.shape[1],
+            chunk_size=self.chunk_size,
         )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:

From cef57b155b3d525c6053a31cafcc53c0496a0432 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 1 Jun 2022 16:24:52 +0200
Subject: [PATCH 25/26] MAINT Document and reduce diff but not the logic

---
 sklearn/metrics/_dist_metrics.pyx.tp          |  22 +-
 .../_pairwise_distances_reduction.pyx.tp      | 439 +++++++++---------
 2 files changed, 249 insertions(+), 212 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index b67db59fe98b8..71644b251c42c 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -33,7 +33,6 @@ implementation_specific_values = [
 import numpy as np
 cimport numpy as cnp
 from cython cimport final
-from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
 
 cnp.import_array()  # required in order to use C-API
 
@@ -43,6 +42,9 @@ cdef extern from "arrayobject.h":
     object PyArray_SimpleNewFromData(int nd, cnp.npy_intp* dims,
                                      int typenum, void* data)
 
+
+from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
+
 from scipy.sparse import csr_matrix, issparse
 from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DTYPECODE
 from ..utils._typedefs import DTYPE, ITYPE
@@ -468,6 +470,7 @@ cdef class DistanceMetric{{name_suffix}}:
 cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Euclidean Distance metric
 
+    .. math::
        D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }
     """
     def __init__(self):
@@ -500,6 +503,7 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Standardized Euclidean Distance metric
 
+    .. math::
        D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
     """
     def __init__(self, V):
@@ -543,6 +547,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Manhattan/City-block Distance metric
 
+    .. math::
        D(x, y) = \sum_i |x_i - y_i|
     """
     def __init__(self):
@@ -563,6 +568,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     """Chebyshev/Infinity Distance
 
+    .. math::
        D(x, y) = max_i (|x_i - y_i|)
 
     Examples
@@ -594,12 +600,14 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Minkowski Distance
 
+    .. math::
         D(x, y) = {||u-v||}_p
 
     when w is None.
 
     Here is the more general expanded expression for the weighted case:
 
+    .. math::
         D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p)
 
     Parameters
@@ -676,6 +684,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     r"""Weighted Minkowski Distance
 
+    .. math::
        D(x, y) = [\sum_i |w_i * (x_i - y_i)|^p] ^ (1/p)
 
     Weighted Minkowski Distance requires p >= 1 and finite.
@@ -718,7 +727,6 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 
         cdef DTYPE_t d = 0
         cdef cnp.intp_t j
-
         for j in range(size):
             d += <DTYPE_t> (pow(self.vec[j] * fabs(x1[j] - x2[j]), self.p))
         return d
@@ -746,6 +754,7 @@ cdef class WMinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
 cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     """Mahalanobis Distance
 
+    .. math::
        D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) }
 
     Parameters
@@ -819,6 +828,7 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     Hamming distance is meant for discrete-valued vectors, though it is
     a valid metric for real-valued vectors.
 
+    .. math::
        D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
     """
     cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
@@ -840,6 +850,7 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     Canberra distance is meant for discrete-valued vectors, though it is
     a valid metric for real-valued vectors.
 
+    .. math::
        D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
     """
     cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
@@ -862,6 +873,7 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     Bray-Curtis distance is meant for discrete-valued vectors, though it is
     a valid metric for real-valued vectors.
 
+    .. math::
        D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
     """
     cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
@@ -940,6 +952,7 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
         D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT)
+
     """
     cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
@@ -964,6 +977,7 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     be treated as False.
 
         D(x, y) = 1 - N_TT / (N + N_TF + N_FT)
+
     """
     cdef inline DTYPE_t dist(self, const {{DTYPE_t}}* x1, const {{DTYPE_t}}* x2,
                              ITYPE_t size) nogil except -1:
@@ -1082,7 +1096,8 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
     to be the latitude, the second is the longitude, given in radians.
     The dimension of the points must be 2:
 
-        D(x, y) = 2 arcsin[sqrt{sin^2((x1 - y1) / 2) + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}]
+    D(x, y) = 2 arcsin[sqrt{sin^2((x1 - y1) / 2) + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}]
+
     """
 
     def _validate_data(self, X):
@@ -1146,7 +1161,6 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
         cdef cnp.ndarray x2arr
         x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size)
         x2arr = _buffer_to_ndarray{{name_suffix}}(x2, size)
-
         d = self.func(x1arr, x2arr, **self.kwargs)
         try:
             # Cython generates code here that results in a TypeError
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index ce2dec47d10d3..6ce8c58952833 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -46,7 +46,6 @@ import warnings
 
 from .. import get_config
 from libc.stdlib cimport free, malloc
-from libc.stdio cimport printf
 from libc.float cimport DBL_MAX
 from libcpp.memory cimport shared_ptr, make_shared
 from libcpp.vector cimport vector
@@ -141,7 +140,19 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
     return squared_row_norms
 
 #####################
+# Interfaces:
+#   These interfaces are meant to be used in the Python code, decoupling the
+#   actual implementation from the Python code. This allows changing all the
+#   private implementation while maintaining a contract for the Python callers.
+#
+#   Each interface extending the base `PairwiseDistancesReduction` interface must
+#   implement the :meth:`compute` classmethod.
+#
+#   Under the hood, such a function must only define the logic to dispatch
+#   at runtime to the correct dtype-specialized `PairwiseDistancesReduction`
+#   implementation based on the dtype of X and of Y.
 
+# Base interface
 cdef class PairwiseDistancesReduction:
     """Abstract base class for pairwise distance computation & reduction.
 
@@ -225,7 +236,6 @@ cdef class PairwiseDistancesReduction:
     def valid_dtypes(cls):
         return (np.float32, np.float64)
 
-
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
         """Return True if the PairwiseDistancesReduction can be used for the given parameters.
@@ -593,215 +603,15 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
 
     return squared_row_norms
 
-# dtype-specific implementations.
+#####################
+# dtype-specific implementations:
+#   For each dtype, an implementation of `PairwiseDistancesReduction` is generated by Tempita.
+#   Computations are dispatched to them at runtime via the interfaces defined above.
 
 {{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
 
 from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
 
-
-cdef class GEMMTermComputer{{bitness}}:
-    """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM.
-
-    `FastEuclidean*` classes internally compute the squared Euclidean distances between
-    chunks of vectors X_c and Y_c using the following decomposition:
-
-
-                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-
-
-    This helper class is in charge of wrapping the common logic to compute
-    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
-    arithmetic intensity.
-    """
-    cdef:
-        const {{DTYPE_t}}[:, ::1] X
-        const {{DTYPE_t}}[:, ::1] Y
-
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-        ITYPE_t dist_middle_terms_chunks_size
-        ITYPE_t n_features
-        ITYPE_t chunk_size
-
-        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
-        vector[vector[DTYPE_t]] dist_middle_terms_chunks
-
-{{if need_upcast}}
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
-        vector[vector[DTYPE_t]] X_c_upcast
-        vector[vector[DTYPE_t]] Y_c_upcast
-{{endif}}
-
-    def __init__(self,
-        {{DTYPE_t}}[:, ::1] X,
-        {{DTYPE_t}}[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        self.X = X
-        self.Y = Y
-        self.effective_n_threads = effective_n_threads
-        self.chunks_n_threads = chunks_n_threads
-        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
-        self.n_features = n_features
-        self.chunk_size = chunk_size
-
-        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-{{if need_upcast}}
-        # We populate the buffer for upcasting chunks of X and Y from 32bit to 64bit.
-        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
-        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-        upcast_buffer_n_elements = self.chunk_size * n_features
-
-        for thread_num in range(self.effective_n_threads):
-            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
-            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
-{{endif}}
-
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-{{if need_upcast}}
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
-
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-{{else}}
-        return
-{{endif}}
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-{{if need_upcast}}
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
-
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
-{{else}}
-        return
-{{endif}}
-
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-{{if need_upcast}}
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
-
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
-{{else}}
-        return
-{{endif}}
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-{{if need_upcast}}
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
-
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-{{else}}
-        return
-{{endif}}
-
-    cdef DTYPE_t * _compute_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            DTYPE_t squared_dist_i_j
-            const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
-            const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_c.shape[0]
-            ITYPE_t n = Y_c.shape[0]
-            ITYPE_t K = X_c.shape[1]
-            DTYPE_t alpha = - 2.
-{{if need_upcast}}
-            DTYPE_t * A = self.X_c_upcast[thread_num].data()
-            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
-{{else}}
-            # Casting for A and B to remove the const is needed because APIs exposed via
-            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
-            # See: https://github.com/scipy/scipy/issues/14262
-            DTYPE_t * A = <DTYPE_t *> &X_c[0, 0]
-            DTYPE_t * B = <DTYPE_t *> &Y_c[0, 0]
-{{endif}}
-            ITYPE_t lda = X_c.shape[1]
-            ITYPE_t ldb = X_c.shape[1]
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_c.shape[0]
-
-        # dist_middle_terms = `-2 * X_c @ Y_c.T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
-
-        return dist_middle_terms
-
-
 cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
     """{{bitness}}bit implementation of PairwiseDistancesReduction."""
 
@@ -929,6 +739,7 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                     X_end = X_start + self.X_n_samples_chunk
 
                 # Reinitializing thread datastructures for the new X chunk
+                # Upcast X[X_start:X_end] to 64bit if needed
                 self._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
                 for Y_chunk_idx in range(self.Y_n_chunks):
@@ -938,6 +749,7 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+                    # Upcast Y[Y_start:Y_end] to 64bit if needed
                     self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
                         X_start, X_end,
                         Y_start, Y_end,
@@ -996,6 +808,7 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                 thread_num = _openmp_thread_num()
 
                 # Initializing datastructures used in this thread
+                # Upcast X[X_start:X_end] to 64bit if needed
                 self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
@@ -1005,6 +818,7 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+                    # Upcast Y[Y_start:Y_end] to 64bit if needed
                     self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
                         X_start, X_end,
                         Y_start, Y_end,
@@ -1086,7 +900,10 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
         ITYPE_t Y_end,
         ITYPE_t thread_num,
     ) nogil:
-        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks."""
+        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks.
+
+        This can be used to upcast Y[Y_start:Y_end] to 64bit when needed.
+        """
         return
 
     cdef void _parallel_on_X_prange_iter_finalize(
@@ -1128,7 +945,10 @@ cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
         ITYPE_t Y_end,
         ITYPE_t thread_num,
     ) nogil:
-        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks."""
+        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks.
+
+        This can be used to upcast Y[Y_start:Y_end] to 64bit when needed.
+        """
         return
 
     cdef void _parallel_on_Y_synchronize(
@@ -1505,6 +1325,208 @@ cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitne
         return np.asarray(self.argkmin_indices)
 
 
+cdef class GEMMTermComputer{{bitness}}:
+    """Component for `FastEuclidean*` variants wrapping the logic for the call to GEMM.
+
+    `FastEuclidean*` classes internally compute the squared Euclidean distances between
+    chunks of vectors X_c and Y_c using the following decomposition:
+
+
+                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
+    arithmetic intensity.
+    """
+    cdef:
+        const {{DTYPE_t}}[:, ::1] X
+        const {{DTYPE_t}}[:, ::1] Y
+
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
+
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
+
+{{if need_upcast}}
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
+{{endif}}
+
+    def __init__(self,
+        {{DTYPE_t}}[:, ::1] X,
+        {{DTYPE_t}}[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        self.X = X
+        self.Y = Y
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = n_features
+        self.chunk_size = chunk_size
+
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+{{if need_upcast}}
+        # We allocate the per-thread buffers used to upcast chunks of X and Y from 32bit to 64bit.
+        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+        upcast_buffer_n_elements = self.chunk_size * n_features
+
+        for thread_num in range(self.effective_n_threads):
+            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+{{endif}}
+
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{else}}
+        return
+{{endif}}
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+{{else}}
+        return
+{{endif}}
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+{{else}}
+        return
+{{endif}}
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{else}}
+        return
+{{endif}}
+
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            DTYPE_t squared_dist_i_j
+            const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
+            const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in the BLAS documentation, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
+{{if need_upcast}}
+            DTYPE_t * A = self.X_c_upcast[thread_num].data()
+            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
+{{else}}
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            # See: https://github.com/scipy/scipy/issues/14262
+            DTYPE_t * A = <DTYPE_t *> &X_c[0, 0]
+            DTYPE_t * B = <DTYPE_t *> &Y_c[0, 0]
+{{endif}}
+            ITYPE_t lda = X_c.shape[1]
+            ITYPE_t ldb = X_c.shape[1]
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_c.shape[0]
+
+        # dist_middle_terms = `-2 * X_c @ Y_c.T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+
 cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArgKmin{{bitness}}):
     """Fast specialized alternative for PairwiseDistancesArgKmin{{bitness}} on EuclideanDistance."""
     cdef:
@@ -1683,6 +1705,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArg
             DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num]
 
+
         # Pushing the distance and their associated indices on heaps
         # which keep tracks of the argkmin.
         for i in range(n_X):

From cbef7f1e15dcca397cc72959f15f6dded0989265 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Tue, 7 Jun 2022 18:15:42 +0200
Subject: [PATCH 26/26] DEBUG Propagate sort_results

---
 sklearn/metrics/_pairwise_distances_reduction.pyx.tp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index 6ce8c58952833..8193832f1b494 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -533,6 +533,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
                 chunk_size=chunk_size,
                 metric_kwargs=metric_kwargs,
                 strategy=strategy,
+                sort_results=sort_results,
                 return_distance=return_distance,
             )
 {{endfor}}