diff --git a/.gitignore b/.gitignore
index f4125316d7d41..967ce97dc38ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,4 @@ sklearn/utils/_weight_vector.pxd
 sklearn/linear_model/_sag_fast.pyx
 sklearn/metrics/_dist_metrics.pyx
 sklearn/metrics/_dist_metrics.pxd
+sklearn/metrics/_pairwise_distances_reduction.pyx
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 4c46c0d631f76..8b5e106e5206a 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -186,6 +186,6 @@ Changelog
 
 
 - |Efficiency| Low-level routines for reductions on pairwise distances
+  for dense float32 and float64 datasets have been refactored.
+  The following functions and estimators now benefit from improved performance,
+  in particular on multi-core machines:
-  for dense float64 datasets have been refactored. The following functions
-  and estimators now benefit from improved performances in terms of hardware
-  scalability and speed-ups:
diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp
index 32ba546672c6e..8fb2f602dc086 100644
--- a/sklearn/metrics/_dist_metrics.pxd.tp
+++ b/sklearn/metrics/_dist_metrics.pxd.tp
@@ -100,12 +100,11 @@ cdef class DistanceMetric{{name_suffix}}:
 
     cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1
 
-{{endfor}}
 
 ######################################################################
 # DatasetsPair base class
-cdef class DatasetsPair:
-    cdef DistanceMetric distance_metric
+cdef class DatasetsPair{{name_suffix}}:
+    cdef DistanceMetric{{name_suffix}} distance_metric
 
     cdef ITYPE_t n_samples_X(self) nogil
 
@@ -116,8 +115,10 @@ cdef class DatasetsPair:
     cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil
 
 
-cdef class DenseDenseDatasetsPair(DatasetsPair):
+cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
     cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
+        const {{DTYPE_t}}[:, ::1] X
+        const {{DTYPE_t}}[:, ::1] Y
         ITYPE_t d
+
+{{endfor}}
diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp
index 5986fa939b45d..71644b251c42c 100644
--- a/sklearn/metrics/_dist_metrics.pyx.tp
+++ b/sklearn/metrics/_dist_metrics.pyx.tp
@@ -1170,11 +1170,10 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}):
             raise TypeError("Custom distance function must accept two "
                             "vectors and return a float.")
 
-{{endfor}}
 
 ######################################################################
 # Datasets Pair Classes
-cdef class DatasetsPair:
+cdef class DatasetsPair{{name_suffix}}:
     """Abstract class which wraps a pair of datasets (X, Y).
 
     This class allows computing distances between a single pair of rows of
@@ -1211,7 +1210,7 @@ cdef class DatasetsPair:
         Y,
         str metric="euclidean",
         dict metric_kwargs=None,
-    ) -> DatasetsPair:
+    ) -> DatasetsPair{{name_suffix}}:
         """Return the DatasetsPair implementation for the given arguments.
 
         Parameters
@@ -1241,14 +1240,14 @@ cdef class DatasetsPair:
             The suited DatasetsPair implementation.
         """
         cdef:
-            DistanceMetric distance_metric = DistanceMetric.get_metric(
+            DistanceMetric{{name_suffix}} distance_metric = DistanceMetric{{name_suffix}}.get_metric(
                 metric,
                 **(metric_kwargs or {})
             )
 
-        if not(X.dtype == Y.dtype == np.float64):
+        if not(X.dtype == Y.dtype and X.dtype in DatasetsPair{{name_suffix}}.valid_dtypes()):
             raise ValueError(
-                f"Only 64bit float datasets are supported at this time, "
+                f"Only np.float64 and np.float32 datasets are supported at this time, "
                 f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
             )
 
@@ -1260,11 +1259,15 @@ cdef class DatasetsPair:
         if issparse(X) or issparse(Y):
             raise ValueError("Only dense datasets are supported for X and Y.")
 
-        return DenseDenseDatasetsPair(X, Y, distance_metric)
+        return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric)
 
-    def __init__(self, DistanceMetric distance_metric):
+    def __init__(self, DistanceMetric{{name_suffix}} distance_metric):
         self.distance_metric = distance_metric
 
+    @classmethod
+    def valid_dtypes(cls):
+        return (np.float64, np.float32)  # dtypes accepted for both X and Y
+
     cdef ITYPE_t n_samples_X(self) nogil:
         """Number of samples in X."""
         # This is a abstract method.
@@ -1289,7 +1292,7 @@ cdef class DatasetsPair:
         return -1
 
 @final
-cdef class DenseDenseDatasetsPair(DatasetsPair):
+cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
     """Compute distances between row vectors of two arrays.
 
     Parameters
@@ -1305,7 +1308,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
         between two row vectors of (X, Y).
     """
 
-    def __init__(self, X, Y, DistanceMetric distance_metric):
+    def __init__(self, X, Y, DistanceMetric{{name_suffix}} distance_metric):
         super().__init__(distance_metric)
         # Arrays have already been checked
         self.X = X
@@ -1331,3 +1334,5 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
         return self.distance_metric.dist(&self.X[i, 0],
                                          &self.Y[j, 0],
                                          self.d)
+
+{{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
similarity index 69%
rename from sklearn/metrics/_pairwise_distances_reduction.pyx
rename to sklearn/metrics/_pairwise_distances_reduction.pyx.tp
index 9191efae2a8da..8193832f1b494 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp
@@ -1,3 +1,30 @@
+{{py:
+
+implementation_specific_values = [
+    # Values are the following ones:
+    #
+    #       name_suffix, bitness, DTYPE_t, DTYPE, need_upcast
+    #
+    # On the one hand, an empty string is used for `name_suffix`
+    # for the 64bit case so as to still be able to expose the original
+    # 64bit implementation under the same API, namely `DistanceMetric`.
+    #
+    # On the other hand, '32' is used for `name_suffix`
+    # for the 32bit case to remove ambiguity and use `DistanceMetric32`,
+    # which is not publicly exposed.
+    #
+    # The metric mapping is adapted accordingly to route to the correct
+    # implementations.
+    #
+    # We also use 64bit types as defined in `sklearn.utils._typedefs`
+    # to maintain backward compatibility at the symbol level for extra
+    # safety.
+    #
+    ('', '64', 'DTYPE_t', 'DTYPE', False),
+    ('32', '32', 'cnp.float32_t', 'np.float32', True)
+]
+
+}}
 # Pairwise Distances Reductions
 # =============================
 #
@@ -25,9 +52,7 @@ from libcpp.vector cimport vector
 from cython cimport final
 from cython.operator cimport dereference as deref
 from cython.parallel cimport parallel, prange
-from cpython.ref cimport Py_INCREF
 
-from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair
 from ..utils._cython_blas cimport (
   BLAS_Order,
   BLAS_Trans,
@@ -53,7 +78,6 @@ from ..utils.fixes import threadpool_limits
 from ..utils._openmp_helpers import _openmp_effective_n_threads
 from ..utils._typedefs import ITYPE, DTYPE
 
-
 cnp.import_array()
 
 # TODO: change for `libcpp.algorithm.move` once Cython 3 is used
@@ -82,8 +106,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
     """Coerce a std::vector of std::vector to a ndarray of ndarray."""
     cdef:
         ITYPE_t n = deref(vecs).size()
-        cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n,
-                                                                      dtype=np.ndarray)
+        cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray)
 
     for i in range(n):
         nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))
@@ -117,7 +140,19 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
     return squared_row_norms
 
 #####################
+# Interfaces:
+#   These interfaces are meant to be used in the Python code, decoupling the
+#   actual implementation from the Python code. This allows changing all the
+#   private implementation while maintaining a contract for the Python callers.
+#
+#   Each interface extending the base `PairwiseDistancesReduction` interface must
+#   implement the :meth:`compute` classmethod.
+#
+#   Under the hood, such a function must only define the logic to dispatch
+#   at runtime to the correct dtype-specialized `PairwiseDistancesReduction`
+#   implementation based on the dtype of X and of Y.
 
+# Base interface
 cdef class PairwiseDistancesReduction:
     """Abstract base class for pairwise distance computation & reduction.
 
@@ -183,32 +218,6 @@ cdef class PairwiseDistancesReduction:
           `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
     """
 
-    cdef:
-        readonly DatasetsPair datasets_pair
-
-        # The number of threads that can be used is stored in effective_n_threads.
-        #
-        # The number of threads to use in the parallelisation strategy
-        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
-        # for small datasets, less threads might be needed to loop over pair of chunks.
-        #
-        # Hence the number of threads that _will_ be used for looping over chunks
-        # is stored in chunks_n_threads, allowing solely using what we need.
-        #
-        # Thus, an invariant is:
-        #
-        #                 chunks_n_threads <= effective_n_threads
-        #
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-
-        ITYPE_t n_samples_chunk, chunk_size
-
-        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
-        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
-
-        bint execute_in_parallel_on_Y
-
     @classmethod
     def valid_metrics(cls) -> List[str]:
         excluded = {
@@ -223,6 +232,10 @@ cdef class PairwiseDistancesReduction:
         }
         return sorted(set(METRIC_MAPPING.keys()) - excluded)
 
+    @classmethod
+    def valid_dtypes(cls):
+        return (np.float32, np.float64)  # dtypes checked by `is_usable_for`
+
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
         """Return True if the PairwiseDistancesReduction can be used for the given parameters.
@@ -232,27 +245,406 @@ cdef class PairwiseDistancesReduction:
         X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
             Input data.
 
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
-            Input data.
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        Returns
+        -------
+        True if the PairwiseDistancesReduction can be used, else False.
+        """
+        dtypes_validity = X.dtype == Y.dtype and Y.dtype in cls.valid_dtypes()
+        return (get_config().get("enable_cython_pairwise_dist", True) and
+                not issparse(X) and not issparse(Y) and dtypes_validity and
+                metric in cls.valid_metrics())
+
+
+cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
+    """Compute the argkmin of row vectors of X on the ones of Y.
+
+    For each row vector of X, computes the indices of the k row vectors
+    of Y with the smallest distances.
+
+    PairwiseDistancesArgKmin is typically used to perform
+    bruteforce k-nearest neighbors queries.
+
+    Parameters
+    ----------
+    datasets_pair: DatasetsPair
+        The dataset pairs (X, Y) for the reduction.
+
+    chunk_size: int, default=None,
+        The number of vectors per chunk. If None (default) looks-up in
+        scikit-learn configuration for `pairwise_dist_chunk_size`,
+        and use 256 if it is not set.
+
+    k: int, default=1
+        The k for the argkmin reduction.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        ITYPE_t k,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+        bint return_distance=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        k : int
+            The k for the argkmin reduction.
+
+        metric : str, default='euclidean'
+            The distance metric to use for argkmin.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining on which dataset the parallelization is made.
+
+            For both strategies the computations happen with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+              - None (default) looks-up in scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+            If return_distance=False:
+              - argkmin_indices : ndarray of shape (n_samples_X, k)
+                Indices of the argkmin for each vector in X.
+
+            If return_distance=True:
+              - argkmin_distances : ndarray of shape (n_samples_X, k)
+                Distances to the argkmin for each vector in X.
+              - argkmin_indices : ndarray of shape (n_samples_X, k)
+                Indices of the argkmin for each vector in X.
+
+        Notes
+        -----
+            This public classmethod is responsible for introspecting the dtypes
+            of `X` and `Y` to dispatch to the `compute` classmethod of
+            `PairwiseDistancesArgKmin64` or `PairwiseDistancesArgKmin32`.
+
+            All temporarily allocated datastructures necessary for the concrete
+            implementation are therefore freed when this classmethod returns.
+
+            This allows decoupling the interface entirely from the
+            implementation details whilst maintaining RAII.
+        """
+{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
+        if X.dtype == Y.dtype == np.float{{bitness}}:
+            return PairwiseDistancesArgKmin{{bitness}}.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+{{endfor}}
+        raise ValueError(
+            f"Datasets must both be of np.float64 or np.float32 dtype. "
+            f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
+    """Compute radius-based neighbors for two sets of vectors.
+
+    For each row-vector X[i] of the queries X, find all the indices j of
+    row-vectors in Y such that:
+
+                        dist(X[i], Y[j]) <= radius
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    Parameters
+    ----------
+    datasets_pair: DatasetsPair
+        The dataset pair (X, Y) for the reduction.
+
+    chunk_size: int, default=None,
+        The number of vectors per chunk. If None (default) looks-up in
+        scikit-learn configuration for `pairwise_dist_chunk_size`,
+        and use 256 if it is not set.
+
+    radius: float
+        The radius defining the neighborhood.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        DTYPE_t radius,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+        bint return_distance=False,
+        bint sort_results=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        radius : float
+            The radius defining the neighborhood.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining on which dataset the parallelization is made.
+
+            For both strategies the computations happen with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+              - None (default) looks-up in scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its neighbors if set to True.
+
+        sort_results : boolean, default=False
+            Sort results with respect to distances between each X vector and its
+            neighbors if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+
+        If return_distance=True:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+          - neighbors_distances : ndarray of n_samples_X ndarray
+            Distances to the neighbors for each vector in X.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the dtypes
+        of `X` and `Y` to dispatch to the `compute` classmethod of
+        the matching dtype-specialized implementation, i.e.
+        `PairwiseDistancesRadiusNeighborhood64` or
+        `PairwiseDistancesRadiusNeighborhood32`.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the interface entirely from the
+        implementation details whilst maintaining RAII.
+        """
+{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
+        if X.dtype == Y.dtype == np.float{{bitness}}:
+            return PairwiseDistancesRadiusNeighborhood{{bitness}}.compute(
+                X=X,
+                Y=Y,
+                radius=radius,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+                return_distance=return_distance,
+            )
+{{endfor}}
+        raise ValueError(
+            f"Datasets must both be of np.float64 or np.float32 dtype. "
+            f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
+    const DTYPE_t[:, ::1] X,
+    ITYPE_t num_threads,
+):
+    """Return the squared euclidean norm of each row of X, computed in parallel.
+
+    This is faster than using np.einsum("ij, ij->i") even when using a single thread.
+    """
+    cdef:
+        # Casting for X to remove the const qualifier is needed because APIs
+        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
+        # const qualifier.
+        # See: https://github.com/scipy/scipy/issues/14262
+        DTYPE_t * X_ptr = <DTYPE_t *> &X[0, 0]
+        ITYPE_t i = 0
+        ITYPE_t n = X.shape[0]  # number of rows
+        ITYPE_t d = X.shape[1]  # number of columns
+        DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
+
+    for i in prange(n, schedule='static', nogil=True, num_threads=num_threads):
+        squared_row_norms[i] = _dot(d, X_ptr + i * d, 1, X_ptr + i * d, 1)  # row i with itself
+
+    return squared_row_norms
+
+
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms32(
+    const cnp.float32_t[:, ::1] X,
+    ITYPE_t num_threads,
+):
+    """Compute the squared euclidean norm of the rows of a float32 X in parallel.
+
+    Each row is upcast to DTYPE_t in a per-thread buffer before calling `_dot`.
+    """
+    cdef:
+        # Casting for X to remove the const qualifier is needed because APIs
+        # exposed via scipy.linalg.cython_blas aren't reflecting the arguments'
+        # const qualifier.
+        # See: https://github.com/scipy/scipy/issues/14262
+        cnp.float32_t * X_ptr = <cnp.float32_t *> &X[0, 0]
+        ITYPE_t i = 0, j = 0
+        ITYPE_t n = X.shape[0]
+        ITYPE_t d = X.shape[1]
+        DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE)
+
+        # Buffer receiving the i-th row of X upcast from 32bit to 64bit
+        DTYPE_t * X_i_upcast_ptr
+
+    with nogil, parallel(num_threads=num_threads):
+        # Thread-local buffer allocation (NOTE(review): malloc result is not checked)
+        X_i_upcast_ptr = <DTYPE_t* > malloc(sizeof(DTYPE_t) * d)
+        for i in prange(n, schedule='static'):
+
+            # Upcasting the i-th row of X from 32bit to 64bit
+            for j in range(d):
+                X_i_upcast_ptr[j] = <DTYPE_t> deref(X_ptr + i * d + j)
+
+            squared_row_norms[i] = _dot(d, X_i_upcast_ptr, 1, X_i_upcast_ptr, 1)
+
+        free(X_i_upcast_ptr)
+
+    return squared_row_norms
+
+#####################
+# dtype-specific implementations:
+#   For each dtype, an implementation of `PairwiseDistancesReduction` is generated by Tempita.
+#   Computations are dispatched to them at runtime via the interfaces defined above.
+
+{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}}
+
+from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}}
+
+cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction):
+    """{{bitness}}bit implementation of PairwiseDistancesReduction."""
+
+    cdef:
+        readonly DatasetsPair{{distance_suffix}} datasets_pair
+
+        # The number of threads that can be used is stored in effective_n_threads.
+        #
+        # The number of threads to use in the parallelisation strategy
+        # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
+        # for small datasets, less threads might be needed to loop over pair of chunks.
+        #
+        # Hence the number of threads that _will_ be used for looping over chunks
+        # is stored in chunks_n_threads, allowing solely using what we need.
+        #
+        # Thus, an invariant is:
+        #
+        #                 chunks_n_threads <= effective_n_threads
+        #
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+
+        ITYPE_t n_samples_chunk, chunk_size
 
-        metric : str, default='euclidean'
-            The distance metric to use.
-            For a list of available metrics, see the documentation of
-            :class:`~sklearn.metrics.DistanceMetric`.
+        ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk
+        ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk
 
-        Returns
-        -------
-        True if the PairwiseDistancesReduction can be used, else False.
-        """
-        # TODO: support sparse arrays and 32 bits
-        return (get_config().get("enable_cython_pairwise_dist", True) and
-                not issparse(X) and X.dtype == np.float64 and
-                not issparse(Y) and Y.dtype == np.float64 and
-                metric in cls.valid_metrics())
+        bint execute_in_parallel_on_Y
 
     def __init__(
         self,
-        DatasetsPair datasets_pair,
+        DatasetsPair{{distance_suffix}} datasets_pair,
         chunk_size=None,
         strategy=None,
      ):
@@ -348,7 +740,8 @@ cdef class PairwiseDistancesReduction:
                     X_end = X_start + self.X_n_samples_chunk
 
                 # Reinitializing thread datastructures for the new X chunk
-                self._parallel_on_X_init_chunk(thread_num, X_start)
+                # Upcast X[X_start:X_end] to 64bit if needed
+                self._parallel_on_X_init_chunk(thread_num, X_start, X_end)
 
                 for Y_chunk_idx in range(self.Y_n_chunks):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -357,6 +750,13 @@ cdef class PairwiseDistancesReduction:
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+                    # Upcast Y[Y_start:Y_end] to 64bit if needed
+                    self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
+                    )
+
                     self._compute_and_reduce_distances_on_chunks(
                         X_start, X_end,
                         Y_start, Y_end,
@@ -409,7 +809,8 @@ cdef class PairwiseDistancesReduction:
                 thread_num = _openmp_thread_num()
 
                 # Initializing datastructures used in this thread
-                self._parallel_on_Y_parallel_init(thread_num)
+                # Upcast X[X_start:X_end] to 64bit if needed
+                self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
 
                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -418,6 +819,13 @@ cdef class PairwiseDistancesReduction:
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+                    # Upcast Y[Y_start:Y_end] to 64bit if needed
+                    self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
+                    )
+
                     self._compute_and_reduce_distances_on_chunks(
                         X_start, X_end,
                         Y_start, Y_end,
@@ -450,8 +858,9 @@ cdef class PairwiseDistancesReduction:
     ) nogil:
         """Compute the pairwise distances on two chunks of X and Y and reduce them.
 
-        This is THE core computational method of PairwiseDistanceReductions.
-        This must be implemented in subclasses.
+        This is THE core computational method of PairwiseDistancesReduction{{bitness}}.
+        This must be implemented in subclasses agnostically from the parallelisation
+        strategies.
         """
         return
 
@@ -479,10 +888,25 @@ cdef class PairwiseDistancesReduction:
         self,
         ITYPE_t thread_num,
         ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         """Initialise datastructures used in a thread given its number."""
         return
 
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks.
+
+        This does nothing here; it is meant to be used to upcast Y[Y_start:Y_end] to 64bit.
+        """
+        return
+
     cdef void _parallel_on_X_prange_iter_finalize(
         self,
         ITYPE_t thread_num,
@@ -508,10 +932,26 @@ cdef class PairwiseDistancesReduction:
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         """Initialise datastructures used in a thread given its number."""
         return
 
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        """Initialise datastructures just before calling _compute_and_reduce_distances_on_chunks.
+
+        This is used, when needed, to upcast Y[Y_start:Y_end] to 64bit.
+        """
+        return
+
     cdef void _parallel_on_Y_synchronize(
         self,
         ITYPE_t X_start,
@@ -526,28 +966,8 @@ cdef class PairwiseDistancesReduction:
         """Update datastructures after executing all the reductions."""
         return
 
-cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
-    """Compute the argkmin of row vectors of X on the ones of Y.
-
-    For each row vector of X, computes the indices of k first the rows
-    vectors of Y with the smallest distances.
-
-    PairwiseDistancesArgKmin is typically used to perform
-    bruteforce k-nearest neighbors queries.
-
-    Parameters
-    ----------
-    datasets_pair: DatasetsPair
-        The dataset pairs (X, Y) for the reduction.
-
-    chunk_size: int, default=None,
-        The number of vectors per chunk. If None (default) looks-up in
-        scikit-learn configuration for `pairwise_dist_chunk_size`,
-        and use 256 if it is not set.
-
-    k: int, default=1
-        The k for the argkmin reduction.
-    """
+cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitness}}):
+    """{{bitness}}bit implementation of PairwiseDistancesArgKmin."""
 
     cdef:
         ITYPE_t k
@@ -644,14 +1064,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         Notes
         -----
             This public classmethod is responsible for introspecting the arguments
-            values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute`
-            instance method of the most appropriate :class:`PairwiseDistancesArgKmin`
+            values to dispatch to the private :meth:`PairwiseDistancesArgKmin{{bitness}}._compute`
+            instance method of the most appropriate :class:`PairwiseDistancesArgKmin{{bitness}}`
             concrete implementation.
 
             All temporarily allocated datastructures necessary for the concrete
             implementation are therefore freed when this classmethod returns.
 
-            This allows entirely decoupling the interface entirely from the
+            This allows decoupling the interface entirely from the
             implementation details whilst maintaining RAII.
         """
         # Note (jjerphan): Some design thoughts for future extensions.
@@ -669,7 +1089,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             # at time to leverage a call to the BLAS GEMM routine as explained
             # in more details in the docstring.
             use_squared_distances = metric == "sqeuclidean"
-            pda = FastEuclideanPairwiseDistancesArgKmin(
+            pda = FastEuclideanPairwiseDistancesArgKmin{{bitness}}(
                 X=X, Y=Y, k=k,
                 use_squared_distances=use_squared_distances,
                 chunk_size=chunk_size,
@@ -679,8 +1099,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         else:
              # Fall back on a generic implementation that handles most scipy
              # metrics by computing the distances between 2 vectors at a time.
-            pda = PairwiseDistancesArgKmin(
-                datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+            pda = PairwiseDistancesArgKmin{{bitness}}(
+                datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs),
                 k=k,
                 chunk_size=chunk_size,
                 strategy=strategy,
@@ -698,7 +1118,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
 
     def __init__(
         self,
-        DatasetsPair datasets_pair,
+        DatasetsPair{{distance_suffix}} datasets_pair,
         chunk_size=None,
         strategy=None,
         ITYPE_t k=1,
@@ -726,7 +1146,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             sizeof(ITYPE_t *) * self.chunks_n_threads
         )
 
-        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`.
+        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin{{bitness}}.compute`.
         self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE)
         self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE)
 
@@ -764,11 +1184,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
                     Y_start + j,
                 )
 
-    @final
     cdef void _parallel_on_X_init_chunk(
         self,
         ITYPE_t thread_num,
         ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         # As this strategy is embarrassingly parallel, we can set each
         # thread's heaps pointer to the proper position on the main heaps.
@@ -819,10 +1239,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
                 heaps_size * sizeof(ITYPE_t)
             )
 
-    @final
     cdef void _parallel_on_Y_parallel_init(
         self,
         ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
         # Initialising heaps (memset can't be used here)
         for idx in range(self.X_n_samples_chunk * self.k):
@@ -899,17 +1320,17 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
 
             # Values are returned identically to the way `KNeighborsMixin.kneighbors`
             # returns values. This is counter-intuitive but this allows not using
-            # complex adaptations where `PairwiseDistancesArgKmin.compute` is called.
+            # complex adaptations where `PairwiseDistancesArgKmin{{bitness}}.compute` is called.
             return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
 
         return np.asarray(self.argkmin_indices)
 
 
-cdef class GEMMTermComputer:
+cdef class GEMMTermComputer{{bitness}}:
     """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM.
 
     `FastEuclidean*` classes internally compute the squared Euclidean distances between
-    chunks of vectors X_c and Y_c using using the decomposition:
+    chunks of vectors X_c and Y_c using the following decomposition:
 
 
                 ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
@@ -919,42 +1340,146 @@ cdef class GEMMTermComputer:
     the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
     arithmetic intensity.
     """
-
     cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
+        const {{DTYPE_t}}[:, ::1] X
+        const {{DTYPE_t}}[:, ::1] Y
 
         ITYPE_t effective_n_threads
         ITYPE_t chunks_n_threads
         ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
 
         # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
         vector[vector[DTYPE_t]] dist_middle_terms_chunks
 
+{{if need_upcast}}
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
+{{endif}}
+
     def __init__(self,
-        DTYPE_t[:, ::1] X,
-        DTYPE_t[:, ::1] Y,
+        {{DTYPE_t}}[:, ::1] X,
+        {{DTYPE_t}}[:, ::1] Y,
         ITYPE_t effective_n_threads,
         ITYPE_t chunks_n_threads,
         ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
     ):
         self.X = X
         self.Y = Y
         self.effective_n_threads = effective_n_threads
         self.chunks_n_threads = chunks_n_threads
         self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = n_features
+        self.chunk_size = chunk_size
 
         self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
 
+{{if need_upcast}}
+        # Allocate one buffer per thread for upcasting chunks of X and Y from 32bit to 64bit.
+        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+        upcast_buffer_n_elements = self.chunk_size * n_features
+
+        for thread_num in range(self.effective_n_threads):
+            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+{{endif}}
+
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{else}}
+        return
+{{endif}}
+
     cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
         self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
 
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+{{else}}
+        return
+{{endif}}
+
     cdef void _parallel_on_Y_init(self) nogil:
         for thread_num in range(self.chunks_n_threads):
             self.dist_middle_terms_chunks[thread_num].resize(
                 self.dist_middle_terms_chunks_size
             )
 
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+{{else}}
+        return
+{{endif}}
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+{{if need_upcast}}
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+{{else}}
+        return
+{{endif}}
+
     cdef DTYPE_t * _compute_distances_on_chunks(
         self,
         ITYPE_t X_start,
@@ -966,9 +1491,8 @@ cdef class GEMMTermComputer:
         cdef:
             ITYPE_t i, j
             DTYPE_t squared_dist_i_j
-
-            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
+            const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
 
             # Careful: LDA, LDB and LDC are given for F-ordered arrays
@@ -983,12 +1507,17 @@ cdef class GEMMTermComputer:
             ITYPE_t n = Y_c.shape[0]
             ITYPE_t K = X_c.shape[1]
             DTYPE_t alpha = - 2.
+{{if need_upcast}}
+            DTYPE_t * A = self.X_c_upcast[thread_num].data()
+            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
+{{else}}
             # Casting for A and B to remove the const is needed because APIs exposed via
             # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
             # See: https://github.com/scipy/scipy/issues/14262
             DTYPE_t * A = <DTYPE_t *> &X_c[0, 0]
-            ITYPE_t lda = X_c.shape[1]
             DTYPE_t * B = <DTYPE_t *> &Y_c[0, 0]
+{{endif}}
+            ITYPE_t lda = X_c.shape[1]
             ITYPE_t ldb = X_c.shape[1]
             DTYPE_t beta = 0.
             ITYPE_t ldc = Y_c.shape[0]
@@ -999,25 +1528,11 @@ cdef class GEMMTermComputer:
         return dist_middle_terms
 
 
-cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
-    """Fast specialized variant for PairwiseDistancesArgKmin on EuclideanDistance.
-
-    The full pairwise squared distances matrix is computed as follows:
-
-                  ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||²
-
-    The middle term gets computed efficiently below using BLAS Level 3 GEMM.
-
-    Notes
-    -----
-    This implementation has a superior arithmetic intensity and hence
-    better running time when the variant is IO bound, but it can suffer
-    from numerical instability caused by catastrophic cancellation potentially
-    introduced by the subtraction in the arithmetic expression above.
-    """
-
+cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArgKmin{{bitness}}):
+    """Fast specialized alternative for PairwiseDistancesArgKmin{{bitness}} on EuclideanDistance."""
     cdef:
-        GEMMTermComputer gemm_term_computer
+        GEMMTermComputer{{bitness}} gemm_term_computer
+
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
@@ -1025,7 +1540,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
 
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
-        return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and
+        return (PairwiseDistancesArgKmin{{bitness}}.is_usable_for(X, Y, metric) and
                 not _in_unstable_openblas_configuration())
 
     def __init__(
@@ -1052,57 +1567,125 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
 
         super().__init__(
             # The datasets pair here is used for exact distances computations
-            datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),
+            datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric="euclidean"),
             chunk_size=chunk_size,
             strategy=strategy,
             k=k,
         )
-        # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair
+        # X and Y are checked by the DatasetsPair{{distance_suffix}} implemented as a DenseDenseDatasetsPair{{distance_suffix}}
         cdef:
-            DenseDenseDatasetsPair datasets_pair = <DenseDenseDatasetsPair> self.datasets_pair
+            DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = (
+            <DenseDenseDatasetsPair{{distance_suffix}}> self.datasets_pair
+        )
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.gemm_term_computer = GEMMTermComputer(
+        self.gemm_term_computer = GEMMTermComputer{{bitness}}(
             datasets_pair.X,
             datasets_pair.Y,
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
+            n_features=datasets_pair.X.shape[1],
+            chunk_size=self.chunk_size,
         )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared")
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(datasets_pair.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{bitness}}(datasets_pair.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
     @final
     cdef void compute_exact_distances(self) nogil:
         if not self.use_squared_distances:
-            PairwiseDistancesArgKmin.compute_exact_distances(self)
+            PairwiseDistancesArgKmin{{bitness}}.compute_exact_distances(self)
 
     @final
     cdef void _parallel_on_X_parallel_init(
         self,
         ITYPE_t thread_num,
     ) nogil:
-        PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num)
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_init(self, thread_num)
         self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
 
+
+    @final
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+
+
+    @final
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num,
+        )
+
+
     @final
     cdef void _parallel_on_Y_init(
         self,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesArgKmin._parallel_on_Y_init(self)
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_init(self)
         self.gemm_term_computer._parallel_on_Y_init()
 
+
+    @final
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+
+
+    @final
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num
+        )
+
+
     @final
     cdef void _compute_and_reduce_distances_on_chunks(
         self,
@@ -1145,7 +1728,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
                 )
 
 
-cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
+cdef class PairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseDistancesReduction{{bitness}}):
     """Compute radius-based neighbors for two sets of vectors.
 
     For each row-vector X[i] of the queries X, find all the indices j of
@@ -1321,7 +1904,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
             # at time to leverage a call to the BLAS GEMM routine as explained
             # in more details in the docstring.
             use_squared_distances = metric == "sqeuclidean"
-            pda = FastEuclideanPairwiseDistancesRadiusNeighborhood(
+            pda = FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}(
                 X=X, Y=Y, radius=radius,
                 use_squared_distances=use_squared_distances,
                 chunk_size=chunk_size,
@@ -1332,8 +1915,8 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
         else:
              # Fall back on a generic implementation that handles most scipy
              # metrics by computing the distances between 2 vectors at a time.
-            pda = PairwiseDistancesRadiusNeighborhood(
-                datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+            pda = PairwiseDistancesRadiusNeighborhood{{bitness}}(
+                datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs),
                 radius=radius,
                 chunk_size=chunk_size,
                 metric_kwargs=metric_kwargs,
@@ -1354,7 +1937,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
 
     def __init__(
         self,
-        DatasetsPair datasets_pair,
+        DatasetsPair{{distance_suffix}} datasets_pair,
         DTYPE_t radius,
         chunk_size=None,
         strategy=None,
@@ -1423,11 +2006,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
 
         return coerce_vectors_to_nd_arrays(self.neigh_indices)
 
-    @final
     cdef void _parallel_on_X_init_chunk(
         self,
         ITYPE_t thread_num,
         ITYPE_t X_start,
+        ITYPE_t X_end,
     ) nogil:
 
         # As this strategy is embarrassingly parallel, we can set the
@@ -1546,7 +2129,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
                 )
 
 
-cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood):
+cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseDistancesRadiusNeighborhood{{bitness}}):
     """Fast specialized variant for PairwiseDistancesRadiusNeighborhood on EuclideanDistance.
 
     The full pairwise squared distances matrix is computed as follows:
@@ -1565,7 +2148,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
     """
 
     cdef:
-        GEMMTermComputer gemm_term_computer
+        GEMMTermComputer{{bitness}} gemm_term_computer
         const DTYPE_t[::1] X_norm_squared
         const DTYPE_t[::1] Y_norm_squared
 
@@ -1573,7 +2156,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
 
     @classmethod
     def is_usable_for(cls, X, Y, metric) -> bool:
-        return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric)
+        return (PairwiseDistancesRadiusNeighborhood{{bitness}}.is_usable_for(X, Y, metric)
                 and not _in_unstable_openblas_configuration())
 
     def __init__(
@@ -1601,7 +2184,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
 
         super().__init__(
             # The datasets pair here is used for exact distances computations
-            datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"),
+            datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric="euclidean"),
             radius=radius,
             chunk_size=chunk_size,
             strategy=strategy,
@@ -1610,26 +2193,28 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
         )
         # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair
         cdef:
-            DenseDenseDatasetsPair datasets_pair = <DenseDenseDatasetsPair> self.datasets_pair
+            DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = <DenseDenseDatasetsPair{{distance_suffix}}> self.datasets_pair
             ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk
 
-        self.gemm_term_computer = GEMMTermComputer(
+        self.gemm_term_computer = GEMMTermComputer{{bitness}}(
             datasets_pair.X,
             datasets_pair.Y,
             self.effective_n_threads,
             self.chunks_n_threads,
             dist_middle_terms_chunks_size,
+            n_features=datasets_pair.X.shape[1],
+            chunk_size=self.chunk_size,
         )
 
         if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
             self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared")
         else:
-            self.Y_norm_squared = _sqeuclidean_row_norms(datasets_pair.Y, self.effective_n_threads)
+            self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(datasets_pair.Y, self.effective_n_threads)
 
         # Do not recompute norms if datasets are identical.
         self.X_norm_squared = (
             self.Y_norm_squared if X is Y else
-            _sqeuclidean_row_norms(datasets_pair.X, self.effective_n_threads)
+            _sqeuclidean_row_norms{{bitness}}(datasets_pair.X, self.effective_n_threads)
         )
         self.use_squared_distances = use_squared_distances
 
@@ -1638,27 +2223,85 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
             # already considered to be the adapted radius, so we overwrite it.
             self.r_radius = radius
 
-    @final
-    cdef void compute_exact_distances(self) nogil:
-        if not self.use_squared_distances:
-            PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self)
-
     @final
     cdef void _parallel_on_X_parallel_init(
         self,
         ITYPE_t thread_num,
     ) nogil:
-        PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num)
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_parallel_init(self, thread_num)
         self.gemm_term_computer._parallel_on_X_parallel_init(thread_num)
 
+    @final
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end)
+
+    @final
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+        self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num,
+        )
+
     @final
     cdef void _parallel_on_Y_init(
         self,
     ) nogil:
         cdef ITYPE_t thread_num
-        PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self)
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_init(self)
         self.gemm_term_computer._parallel_on_Y_init()
 
+    @final
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end)
+        self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
+
+    @final
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            self,
+            X_start, X_end,
+            Y_start, Y_end,
+            thread_num,
+        )
+        self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+            X_start, X_end, Y_start, Y_end, thread_num
+        )
+
+    @final
+    cdef void compute_exact_distances(self) nogil:
+        if not self.use_squared_distances:
+            PairwiseDistancesRadiusNeighborhood{{bitness}}.compute_exact_distances(self)
+
     @final
     cdef void _compute_and_reduce_distances_on_chunks(
         self,
@@ -1692,3 +2335,4 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad
                 if squared_dist_i_j <= self.r_radius:
                     deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j)
                     deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start)
+{{endfor}}
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index fc912068cb6c4..c343cadb2c258 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -24,6 +24,7 @@ def configuration(parent_package="", top_path=None):
     templates = [
         "sklearn/metrics/_dist_metrics.pyx.tp",
         "sklearn/metrics/_dist_metrics.pxd.tp",
+        "sklearn/metrics/_pairwise_distances_reduction.pyx.tp",
     ]
 
     gen_from_templates(templates)
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 192f7ef43a6c6..7bfd89dc06800 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -9,7 +9,8 @@
     PairwiseDistancesReduction,
     PairwiseDistancesArgKmin,
     PairwiseDistancesRadiusNeighborhood,
-    _sqeuclidean_row_norms,
+    _sqeuclidean_row_norms64,
+    _sqeuclidean_row_norms32,
 )
 
 from sklearn.metrics import euclidean_distances
@@ -66,7 +67,7 @@ def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
     return [{}]
 
 
-def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices):
+def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
     assert_array_equal(
         ref_indices,
         indices,
@@ -76,10 +77,69 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices):
         ref_dist,
         dist,
         err_msg="Query vectors have different neighbors' distances",
-        rtol=1e-7,
+        rtol=rtol,
     )
 
 
+def assert_argkmin_results_quasi_equality(
+    ref_dist, dist, ref_indices, indices, rtol=1e-4
+):
+    """Assert argkmin results are equal up to swaps of near-tied neighbors.
+
+    Distances must match the reference within ``rtol``. Indices must match
+    too, except that two adjacent neighbors of a same query vector may be
+    swapped when their distances are within ``rtol`` of each other.
+
+    Parameters
+    ----------
+    ref_dist, dist : ndarray
+        Reference and tested distances. The last axis is assumed to index
+        the neighbors of one query vector (TODO confirm against callers).
+    ref_indices, indices : ndarray
+        Reference and tested indices, laid out as the distances.
+    rtol : float, default=1e-4
+        Relative tolerance used when comparing distances.
+    """
+    # Swaps are only valid within a row: record its length before flattening.
+    row_length = ref_dist.shape[-1]
+    ref_dist, dist, ref_indices, indices = map(
+        np.ndarray.flatten, [ref_dist, dist, ref_indices, indices]
+    )
+
+    assert (
+        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
+    ), "Arrays of results have various length."
+
+    n = len(ref_dist)
+    # Every distance is compared to its reference, the last one included.
+    assert np.isclose(
+        dist, ref_dist, rtol=rtol
+    ).all(), "Query vectors have different neighbors' distances"
+
+    i = 0
+    while i < n:
+        if ref_indices[i] == indices[i]:
+            i += 1
+            continue
+        # The indices differ: this is only valid if the current and the next
+        # neighbors were swapped because their distances are close.
+        valid_permutation = (
+            (i + 1) % row_length != 0
+            and np.isclose(dist[i], dist[i + 1], rtol=rtol)
+            and indices[i + 1] == ref_indices[i]
+            and ref_indices[i + 1] == indices[i]
+        )
+        window = [max(i - 1, 0), i, min(i + 1, n - 1)]  # clamped prev/current/next
+        assert valid_permutation, (
+            "Query vectors have different neighbors' indices \n"
+            f"(i_prev, i_current, i_next) = {tuple(indices[window])} \n"
+            f"(ri_prev, ri_current, ri_next) = {tuple(ref_indices[window])} \n"
+            f"(d_prev, d_current, d_next) = {tuple(dist[window])} \n"
+            f"(rd_prev, rd_current, rd_next) = {tuple(ref_dist[window])} \n"
+        )
+        i += 2  # both members of the swap are validated: move past the pair
+
+
 def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices):
     # We get arrays of arrays and we need to check for individual pairs
     for i in range(ref_dist.shape[0]):
@@ -97,8 +157,20 @@ def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, ind
 
 
 ASSERT_RESULT = {
-    PairwiseDistancesArgKmin: assert_argkmin_results_equality,
-    PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality,
+    # For 64bit data, the strict equality helpers are used for both reductions.
+    (PairwiseDistancesArgKmin, np.float64): assert_argkmin_results_equality,
+    (
+        PairwiseDistancesRadiusNeighborhood,
+        np.float64,
+    ): assert_radius_neighborhood_results_equality,
+    # For 32bit data, neighbors can be permuted due to small differences in the
+    # computation of their distances: argkmin results are hence compared up to
+    # valid permutations, while radius results still use the strict helper.
+    (PairwiseDistancesArgKmin, np.float32): assert_argkmin_results_quasi_equality,
+    (
+        PairwiseDistancesRadiusNeighborhood,
+        np.float32,
+    ): assert_radius_neighborhood_results_equality,
 }
 
 
@@ -107,13 +179,18 @@ def test_pairwise_distances_reduction_is_usable_for():
     X = rng.rand(100, 10)
     Y = rng.rand(100, 10)
     metric = "euclidean"
-    assert PairwiseDistancesReduction.is_usable_for(X, Y, metric)
+
+    assert PairwiseDistancesReduction.is_usable_for(
+        X.astype(np.float64), X.astype(np.float64), metric
+    )
+    assert PairwiseDistancesReduction.is_usable_for(
+        X.astype(np.float32), X.astype(np.float32), metric
+    )
     assert not PairwiseDistancesReduction.is_usable_for(
         X.astype(np.int64), Y.astype(np.int64), metric
     )
 
     assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc")
-    # TODO: remove once 32 bits datasets are supported
     assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric)
     assert not PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric)
 
@@ -130,8 +207,8 @@ def test_argkmin_factory_method_wrong_usages():
     metric = "euclidean"
 
     msg = (
-        "Only 64bit float datasets are supported at this time, "
-        "got: X.dtype=float32 and Y.dtype=float64"
+        "Datasets must both be of np.float64 or np.float32 dtype. Currently: "
+        "X.dtype=float32 and Y.dtype=float64."
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(
@@ -139,8 +216,8 @@ def test_argkmin_factory_method_wrong_usages():
         )
 
     msg = (
-        "Only 64bit float datasets are supported at this time, "
-        "got: X.dtype=float64 and Y.dtype=int32"
+        "Datasets must both be of np.float64 or np.float32 dtype. Currently: "
+        "X.dtype=float64 and Y.dtype=int32"
     )
     with pytest.raises(ValueError, match=msg):
         PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)
@@ -171,7 +248,7 @@ def test_argkmin_factory_method_wrong_usages():
     message = (
         r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this"
         r" case \("
-        r"FastEuclideanPairwiseDistancesArgKmin\) and will be ignored."
+        r"FastEuclideanPairwiseDistancesArgKmin."
     )
 
     with pytest.warns(UserWarning, match=message):
@@ -190,8 +267,8 @@ def test_radius_neighborhood_factory_method_wrong_usages():
     with pytest.raises(
         ValueError,
         match=(
-            "Only 64bit float datasets are supported at this time, "
-            "got: X.dtype=float32 and Y.dtype=float64"
+            "Datasets must both be of np.float64 or np.float32 dtype. "
+            "Currently: X.dtype=float32 and Y.dtype=float64"
         ),
     ):
         PairwiseDistancesRadiusNeighborhood.compute(
@@ -201,8 +278,8 @@ def test_radius_neighborhood_factory_method_wrong_usages():
     with pytest.raises(
         ValueError,
         match=(
-            "Only 64bit float datasets are supported at this time, "
-            "got: X.dtype=float64 and Y.dtype=int32"
+            "Datasets must both be of np.float64 or np.float32 dtype. "
+            "Currently: X.dtype=float64 and Y.dtype=int32"
         ),
     ):
         PairwiseDistancesRadiusNeighborhood.compute(
@@ -233,8 +310,7 @@ def test_radius_neighborhood_factory_method_wrong_usages():
 
     message = (
         r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this"
-        r" case \(FastEuclideanPairwiseDistancesRadiusNeighborhood\) and will be"
-        r" ignored."
+        r" case \(FastEuclideanPairwiseDistancesRadiusNeighborhood"
     )
 
     with pytest.warns(UserWarning, match=message):
@@ -245,6 +321,7 @@ def test_radius_neighborhood_factory_method_wrong_usages():
 
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("chunk_size", [50, 512, 1024])
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
@@ -254,8 +331,8 @@ def test_chunk_size_agnosticism(
     PairwiseDistancesReduction,
     n_samples,
     chunk_size,
+    dtype,
     n_features=100,
-    dtype=np.float64,
 ):
     # Results should not depend on the chunk size
     rng = np.random.RandomState(global_random_seed)
@@ -274,6 +351,7 @@ def test_chunk_size_agnosticism(
         X,
         Y,
         parameter,
+        metric="manhattan",
         return_distance=True,
     )
 
@@ -282,25 +360,27 @@ def test_chunk_size_agnosticism(
         Y,
         parameter,
         chunk_size=chunk_size,
+        metric="manhattan",
         return_distance=True,
     )
 
-    ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
+        ref_dist, dist, ref_indices, indices
+    )
 
 
 @pytest.mark.parametrize("n_samples", [100, 1000])
-@pytest.mark.parametrize("chunk_size", [50, 512, 1024])
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
 )
 def test_n_threads_agnosticism(
     global_random_seed,
+    dtype,
     PairwiseDistancesReduction,
     n_samples,
-    chunk_size,
     n_features=100,
-    dtype=np.float64,
 ):
     # Results should not depend on the number of threads
     rng = np.random.RandomState(global_random_seed)
@@ -327,13 +407,62 @@ def test_n_threads_agnosticism(
             X, Y, parameter, return_distance=True
         )
 
-    ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
+        ref_dist, dist, ref_indices, indices
+    )
+
+
+# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
+@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
+@pytest.mark.parametrize("seed", range(5))
+@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics())
+@pytest.mark.parametrize("PairwiseDistancesReduction", [PairwiseDistancesArgKmin])
+def test_dtype_agnosticism(
+    PairwiseDistancesReduction,
+    seed,
+    metric,
+    n_samples=1000,
+    n_features=100,
+):
+    # float32 results must match float64 ones (up to near-tie swaps), per metric.
+    rng = np.random.RandomState(seed)
+    spread = 100
+    X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
+    Y64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
+    if metric == "haversine":
+        # Haversine distance only accepts 2D data.
+        # NOTE(review): confirm the float32 tolerance holds in such low dimension.
+        X64 = np.ascontiguousarray(X64[:, :2])
+        Y64 = np.ascontiguousarray(Y64[:, :2])
+    X32, Y32 = X64.astype(np.float32), Y64.astype(np.float32)
+
+    # Number of neighbors for ArgKmin, else a radius scaled with the dimension.
+    is_argkmin = PairwiseDistancesReduction is PairwiseDistancesArgKmin
+    parameter = 10 if is_argkmin else 10 ** np.log(n_features)
+
+    # Forward the parametrized metric, with the parameters it requires if any.
+    kwargs = dict(
+        metric=metric,
+        metric_kwargs=_get_metric_params_list(metric, X64.shape[1], seed=seed)[0],
+        return_distance=True,
+    )
+    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
+        X64, Y64, parameter, **kwargs
+    )
+    dist, indices = PairwiseDistancesReduction.compute(X32, Y32, parameter, **kwargs)
+
+    # The float32 assertion is used since float32 inputs lose precision.
+    dist = dist.astype(ref_dist.dtype)
+    ASSERT_RESULT[(PairwiseDistancesReduction, np.float32)](
+        ref_dist, dist, ref_indices, indices
+    )
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("n_samples", [100, 1000])
 @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics())
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize(
     "PairwiseDistancesReduction",
     [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
@@ -343,8 +472,8 @@ def test_strategies_consistency(
     PairwiseDistancesReduction,
     metric,
     n_samples,
+    dtype,
     n_features=10,
-    dtype=np.float64,
 ):
 
     rng = np.random.RandomState(global_random_seed)
@@ -394,7 +523,7 @@ def test_strategies_consistency(
         return_distance=True,
     )
 
-    ASSERT_RESULT[PairwiseDistancesReduction](
+    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
         dist_par_X,
         dist_par_Y,
         indices_par_X,
@@ -409,6 +538,7 @@ def test_strategies_consistency(
 @pytest.mark.parametrize("n_features", [50, 500])
 @pytest.mark.parametrize("translation", [0, 1e6])
 @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
+@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes())
 @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
 def test_pairwise_distances_argkmin(
     global_random_seed,
@@ -416,9 +546,9 @@ def test_pairwise_distances_argkmin(
     translation,
     metric,
     strategy,
+    dtype,
     n_samples=100,
     k=10,
-    dtype=np.float64,
 ):
     rng = np.random.RandomState(global_random_seed)
     spread = 1000
@@ -439,7 +569,7 @@ def test_pairwise_distances_argkmin(
     else:
         dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
     # Taking argkmin (indices of the k smallest values)
-    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
+    argkmin_indices_ref = np.argsort(dist_matrix, kind="mergesort", axis=1)[:, :k]
     # Getting the associated distances
     argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
     for row_idx in range(argkmin_indices_ref.shape[0]):
@@ -459,8 +589,11 @@ def test_pairwise_distances_argkmin(
         strategy=strategy,
     )
 
-    ASSERT_RESULT[PairwiseDistancesArgKmin](
-        argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref
+    ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)](
+        argkmin_distances,
+        argkmin_distances_ref,
+        argkmin_indices,
+        argkmin_indices_ref,
     )
 
 
@@ -526,7 +659,7 @@ def test_pairwise_distances_radius_neighbors(
         sort_results=True,
     )
 
-    ASSERT_RESULT[PairwiseDistancesRadiusNeighborhood](
+    ASSERT_RESULT[(PairwiseDistancesRadiusNeighborhood, dtype)](
         neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref
     )
 
@@ -539,13 +672,15 @@ def test_sqeuclidean_row_norms(
     n_samples,
     n_features,
     num_threads,
-    dtype=np.float64,
 ):
     rng = np.random.RandomState(global_random_seed)
     spread = 100
-    X = rng.rand(n_samples, n_features).astype(dtype) * spread
+    X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread
+    X32 = X64.astype(np.float32)
 
-    sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
-    sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads))
+    sq_row_norm_reference = np.linalg.norm(X64, axis=1) ** 2
+    sq_row_norm64 = np.asarray(_sqeuclidean_row_norms64(X64, num_threads=num_threads))
+    sq_row_norm32 = np.asarray(_sqeuclidean_row_norms32(X32, num_threads=num_threads))
 
-    assert_allclose(sq_row_norm_reference, sq_row_norm)
+    assert_allclose(sq_row_norm_reference, sq_row_norm64, rtol=1e-7)
+    assert_allclose(sq_row_norm_reference, sq_row_norm32, rtol=1e-6)