diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 32ba546672c6e..ef23f2af50ffb 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -101,23 +101,3 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1 {{endfor}} - -###################################################################### -# DatasetsPair base class -cdef class DatasetsPair: - cdef DistanceMetric distance_metric - - cdef ITYPE_t n_samples_X(self) nogil - - cdef ITYPE_t n_samples_Y(self) nogil - - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil - - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil - - -cdef class DenseDenseDatasetsPair(DatasetsPair): - cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y - ITYPE_t d diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 5986fa939b45d..47bd1dcbab519 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -32,7 +32,6 @@ implementation_specific_values = [ import numpy as np cimport numpy as cnp -from cython cimport final cnp.import_array() # required in order to use C-API @@ -1171,163 +1170,3 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): "vectors and return a float.") {{endfor}} - -###################################################################### -# Datasets Pair Classes -cdef class DatasetsPair: - """Abstract class which wraps a pair of datasets (X, Y). - - This class allows computing distances between a single pair of rows of - of X and Y at a time given the pair of their indices (i, j). This class is - specialized for each metric thanks to the :func:`get_for` factory classmethod. - - The handling of parallelization over chunks to compute the distances - and aggregation for several rows at a time is done in dedicated - subclasses of PairwiseDistancesReduction that in-turn rely on - subclasses of DatasetsPair for each pair of rows in the data. The goal - is to make it possible to decouple the generic parallelization and - aggregation logic from metric-specific computation as much as - possible. - - X and Y can be stored as C-contiguous np.ndarrays or CSR matrices - in subclasses. - - This class avoids the overhead of dispatching distance computations - to :class:`sklearn.metrics.DistanceMetric` based on the physical - representation of the vectors (sparse vs. dense). It makes use of - cython.final to remove the overhead of dispatching method calls. - - Parameters - ---------- - distance_metric: DistanceMetric - The distance metric responsible for computing distances - between two vectors of (X, Y). - """ - - @classmethod - def get_for( - cls, - X, - Y, - str metric="euclidean", - dict metric_kwargs=None, - ) -> DatasetsPair: - """Return the DatasetsPair implementation for the given arguments. - - Parameters - ---------- - X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - If provided as a sparse matrix, it must be in CSR format. - - Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - If provided as a sparse matrix, it must be in CSR format. - - metric : str, default='euclidean' - The distance metric to compute between rows of X and Y. - The default metric is a fast implementation of the Euclidean - metric. 
For a list of available metrics, see the documentation - of :class:`~sklearn.metrics.DistanceMetric`. - - metric_kwargs : dict, default=None - Keyword arguments to pass to specified metric function. - - Returns - ------- - datasets_pair: DatasetsPair - The suited DatasetsPair implementation. - """ - cdef: - DistanceMetric distance_metric = DistanceMetric.get_metric( - metric, - **(metric_kwargs or {}) - ) - - if not(X.dtype == Y.dtype == np.float64): - raise ValueError( - f"Only 64bit float datasets are supported at this time, " - f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." - ) - - # Metric-specific checks that do not replace nor duplicate `check_array`. - distance_metric._validate_data(X) - distance_metric._validate_data(Y) - - # TODO: dispatch to other dataset pairs for sparse support once available: - if issparse(X) or issparse(Y): - raise ValueError("Only dense datasets are supported for X and Y.") - - return DenseDenseDatasetsPair(X, Y, distance_metric) - - def __init__(self, DistanceMetric distance_metric): - self.distance_metric = distance_metric - - cdef ITYPE_t n_samples_X(self) nogil: - """Number of samples in X.""" - # This is a abstract method. - # This _must_ always be overwritten in subclasses. - # TODO: add "with gil: raise" here when supporting Cython 3.0 - return -999 - - cdef ITYPE_t n_samples_Y(self) nogil: - """Number of samples in Y.""" - # This is a abstract method. - # This _must_ always be overwritten in subclasses. - # TODO: add "with gil: raise" here when supporting Cython 3.0 - return -999 - - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - return self.dist(i, j) - - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - # This is a abstract method. - # This _must_ always be overwritten in subclasses. - # TODO: add "with gil: raise" here when supporting Cython 3.0 - return -1 - -@final -cdef class DenseDenseDatasetsPair(DatasetsPair): - """Compute distances between row vectors of two arrays. - - Parameters - ---------- - X: ndarray of shape (n_samples_X, n_features) - Rows represent vectors. Must be C-contiguous. - - Y: ndarray of shape (n_samples_Y, n_features) - Rows represent vectors. Must be C-contiguous. - - distance_metric: DistanceMetric - The distance metric responsible for computing distances - between two row vectors of (X, Y). - """ - - def __init__(self, X, Y, DistanceMetric distance_metric): - super().__init__(distance_metric) - # Arrays have already been checked - self.X = X - self.Y = Y - self.d = X.shape[1] - - @final - cdef ITYPE_t n_samples_X(self) nogil: - return self.X.shape[0] - - @final - cdef ITYPE_t n_samples_Y(self) nogil: - return self.Y.shape[0] - - @final - cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: - return self.distance_metric.rdist(&self.X[i, 0], - &self.Y[j, 0], - self.d) - - @final - cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: - return self.distance_metric.dist(&self.X[i, 0], - &self.Y[j, 0], - self.d) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx deleted file mode 100644 index 9606eb1273ce8..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ /dev/null @@ -1,1992 +0,0 @@ -# Pairwise Distances Reductions -# ============================= -# -# Author: Julien Jerphanion -# -# Overview -# -------- -# -# This module provides routines to compute pairwise distances between a set -# of row vectors of X and another set of row vectors of Y and apply a -# reduction on top. 
The canonical example is the brute-force computation -# of the top k nearest neighbors by leveraging the arg-k-min reduction. -# -# The reduction takes a matrix of pairwise distances between rows of X and Y -# as input and outputs an aggregate data-structure for each row of X. The -# aggregate values are typically smaller than the number of rows in Y, hence -# the term reduction. -# -# For computational reasons, the reduction are performed on the fly on chunks -# of rows of X and Y so as to keep intermediate data-structures in CPU cache -# and avoid unnecessary round trips of large distance arrays with the RAM -# that would otherwise severely degrade the speed by making the overall -# processing memory-bound. -# -# Finally, the routines follow a generic parallelization template to process -# chunks of data with OpenMP loops (via Cython prange), either on rows of X -# or rows of Y depending on their respective sizes. -# -# -# Dispatching to specialized implementations -# ------------------------------------------ -# -# Dispatchers are meant to be used in the Python code. Under the hood, a -# dispatcher must only define the logic to choose at runtime to the correct -# dtype-specialized :class:`PairwiseDistancesReduction` implementation based -# on the dtype of X and of Y. -# -# -# High-level diagram -# ------------------ -# -# Legend: -# -# A ---⊳ B: A inherits from B -# A ---x B: A dispatches on B -# -# -# (base dispatcher) -# PairwiseDistancesReduction -# ∆ -# | -# | -# +-----------------+-----------------+ -# | | -# (dispatcher) (dispatcher) -# PairwiseDistancesArgKmin PairwiseDistancesRadiusNeighbors -# | | -# | | -# | | -# | (64bit implem.) | -# | PairwiseDistancesReduction64 | -# | ∆ | -# | | | -# | | | -# | +-----------------+-----------------+ | -# | | | | -# | | | | -# x | | x -# PairwiseDistancesArgKmin64 PairwiseDistancesRadiusNeighbors64 -# | ∆ ∆ | -# | | | | -# x | | | -# FastEuclideanPairwiseDistancesArgKmin64 | | -# | | -# | x -# FastEuclideanPairwiseDistancesRadiusNeighbors64 -# -# For instance :class:`PairwiseDistancesArgKmin`, dispatches to -# :class:`PairwiseDistancesArgKmin64` if X and Y are both dense NumPy arrays -# with a float64 dtype. -# -# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean", -# :class:`PairwiseDistancesArgKmin64` further dispatches to -# :class:`FastEuclideanPairwiseDistancesArgKmin64` a specialized subclass -# to optimally handle the Euclidean distance case using the Generalized Matrix -# Multiplication (see the docstring of :class:`GEMMTermComputer64` for details). -from abc import abstractmethod - -cimport numpy as cnp -import numpy as np -import warnings - -from .. 
import get_config -from libc.stdlib cimport free, malloc -from libc.float cimport DBL_MAX -from libcpp.memory cimport shared_ptr, make_shared -from libcpp.vector cimport vector -from cython cimport final -from cython.operator cimport dereference as deref -from cython.parallel cimport parallel, prange - -from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair -from ..utils._cython_blas cimport ( - BLAS_Order, - BLAS_Trans, - ColMajor, - NoTrans, - RowMajor, - Trans, - _dot, - _gemm, -) -from ..utils._heap cimport heap_push -from ..utils._sorting cimport simultaneous_sort -from ..utils._openmp_helpers cimport _openmp_thread_num -from ..utils._typedefs cimport ITYPE_t, DTYPE_t -from ..utils._vector_sentinel cimport vector_to_nd_array - -from numbers import Integral, Real -from typing import List -from scipy.sparse import issparse -from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ..utils import check_scalar, _in_unstable_openblas_configuration -from ..utils.fixes import threadpool_limits -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils._typedefs import ITYPE, DTYPE - -cnp.import_array() - -# TODO: change for `libcpp.algorithm.move` once Cython 3 is used -# Introduction in Cython: -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa -cdef extern from "" namespace "std" nogil: - OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa - -###################### -## std::vector to np.ndarray coercion -# As type covariance is not supported for C++ containers via Cython, -# we need to redefine fused types. -ctypedef fused vector_DITYPE_t: - vector[ITYPE_t] - vector[DTYPE_t] - - -ctypedef fused vector_vector_DITYPE_t: - vector[vector[ITYPE_t]] - vector[vector[DTYPE_t]] - - -cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( - shared_ptr[vector_vector_DITYPE_t] vecs -): - """Coerce a std::vector of std::vector to a ndarray of ndarray.""" - cdef: - ITYPE_t n = deref(vecs).size() - cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) - - for i in range(n): - nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) - - return nd_arrays_of_nd_arrays - -##################### -# Dispatchers - -class PairwiseDistancesReduction: - """Abstract base dispatcher for pairwise distance computation & reduction. - - Each dispatcher extending the base :class:`PairwiseDistancesReduction` - dispatcher must implement the :meth:`compute` classmethod. - """ - - @classmethod - def valid_metrics(cls) -> List[str]: - excluded = { - "pyfunc", # is relatively slow because we need to coerce data as np arrays - "mahalanobis", # is numerically unstable - # TODO: In order to support discrete distance metrics, we need to have a - # stable simultaneous sort which preserves the order of the input. - # The best might be using std::stable_sort and a Comparator taking an - # Arrays of Structures instead of Structure of Arrays (currently used). - "hamming", - *BOOL_METRICS, - } - return sorted(set(METRIC_MAPPING.keys()) - excluded) - - @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: - """Return True if the PairwiseDistancesReduction can be used for the - given parameters. - - Parameters - ---------- - X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) - Input data. - - Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) - Input data. 
- - metric : str, default='euclidean' - The distance metric to use. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. - - Returns - ------- - True if the PairwiseDistancesReduction can be used, else False. - """ - dtypes_validity = X.dtype == Y.dtype and Y.dtype == np.float64 - return (get_config().get("enable_cython_pairwise_dist", True) and - not issparse(X) and not issparse(Y) and dtypes_validity and - metric in cls.valid_metrics()) - - @classmethod - @abstractmethod - def compute( - cls, - X, - Y, - **kwargs, - ): - """Compute the reduction. - - Parameters - ---------- - X : ndarray or CSR matrix of shape (n_samples_X, n_features) - Input data. - - Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) - Input data. - - **kwargs : additional parameters for the reduction - - Notes - ----- - This method is an abstract class method: it has to be implemented - for all subclasses. - """ - -class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - """Compute the argkmin of row vectors of X on the ones of Y. - - For each row vector of X, computes the indices of k first the rows - vectors of Y with the smallest distances. - - PairwiseDistancesArgKmin is typically used to perform - bruteforce k-nearest neighbors queries. - - This class is not meant to be instanciated, one should only use - its :meth:`compute` classmethod which handles allocation and - deallocation consistently. - """ - - @classmethod - def compute( - cls, - X, - Y, - k, - metric="euclidean", - chunk_size=None, - metric_kwargs=None, - strategy=None, - return_distance=False, - ): - """Compute the argkmin reduction. - - Parameters - ---------- - X : ndarray or CSR matrix of shape (n_samples_X, n_features) - Input data. - - Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) - Input data. - - k : int - The k for the argkmin reduction. - - metric : str, default='euclidean' - The distance metric to use for argkmin. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. - - chunk_size : int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - metric_kwargs : dict, default=None - Keyword arguments to pass to specified metric function. - - strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - For both strategies the computations happens with two nested loops, - respectively on chunks of X and chunks of Y. - Strategies differs on which loop (outer or inner) is made to run - in parallel with the Cython `prange` construct: - - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. - - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. - - - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, - 'parallel_on_X' is usually the most efficient strategy. 
When `X.shape[0]` - is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity - for parallelism and is therefore more efficient despite the synchronization - step at each iteration of the outer loop on chunks of `X`. - - - None (default) looks-up in scikit-learn configuration for - `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. - - return_distance : boolean, default=False - Return distances between each X vector and its - argkmin if set to True. - - Returns - ------- - If return_distance=False: - - argkmin_indices : ndarray of shape (n_samples_X, k) - Indices of the argkmin for each vector in X. - - If return_distance=True: - - argkmin_distances : ndarray of shape (n_samples_X, k) - Distances to the argkmin for each vector in X. - - argkmin_indices : ndarray of shape (n_samples_X, k) - Indices of the argkmin for each vector in X. - - Notes - ----- - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`PairwiseDistancesArgKmin`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. - if X.dtype == Y.dtype == np.float64: - return PairwiseDistancesArgKmin64.compute( - X=X, - Y=Y, - k=k, - metric=metric, - chunk_size=chunk_size, - metric_kwargs=metric_kwargs, - strategy=strategy, - return_distance=return_distance, - ) - raise ValueError( - f"Only 64bit float datasets are supported at this time, " - f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." - ) - - -class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): - """Compute radius-based neighbors for two sets of vectors. - - For each row-vector X[i] of the queries X, find all the indices j of - row-vectors in Y such that: - - dist(X[i], Y[j]) <= radius - - The distance function `dist` depends on the values of the `metric` - and `metric_kwargs` parameters. - - This class is not meant to be instanciated, one should only use - its :meth:`compute` classmethod which handles allocation and - deallocation consistently. - """ - - @classmethod - def compute( - cls, - X, - Y, - radius, - metric="euclidean", - chunk_size=None, - metric_kwargs=None, - strategy=None, - return_distance=False, - sort_results=False, - ): - """Return the results of the reduction for the given arguments. - - Parameters - ---------- - X : ndarray or CSR matrix of shape (n_samples_X, n_features) - Input data. - - Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) - Input data. - - radius : float - The radius defining the neighborhood. - - metric : str, default='euclidean' - The distance metric to use. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. - - chunk_size : int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - metric_kwargs : dict, default=None - Keyword arguments to pass to specified metric function. 
- - strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - For both strategies the computations happens with two nested loops, - respectively on chunks of X and chunks of Y. - Strategies differs on which loop (outer or inner) is made to run - in parallel with the Cython `prange` construct: - - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. - - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. - - - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, - 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` - is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity - for parallelism and is therefore more efficient despite the synchronization - step at each iteration of the outer loop on chunks of `X`. - - - None (default) looks-up in scikit-learn configuration for - `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. - - return_distance : boolean, default=False - Return distances between each X vector and its neighbors if set to True. - - sort_results : boolean, default=False - Sort results with respect to distances between each X vector and its - neighbors if set to True. - - Returns - ------- - If return_distance=False: - - neighbors_indices : ndarray of n_samples_X ndarray - Indices of the neighbors for each vector in X. - - If return_distance=True: - - neighbors_indices : ndarray of n_samples_X ndarray - Indices of the neighbors for each vector in X. - - neighbors_distances : ndarray of n_samples_X ndarray - Distances to the neighbors for each vector in X. - - Notes - ----- - This public classmethod is responsible for introspecting the arguments - values to dispatch to the private dtype-specialized implementation of - :class:`PairwiseDistancesRadiusNeighborhood`. - - All temporarily allocated datastructures necessary for the concrete - implementation are therefore freed when this classmethod returns. - - This allows entirely decoupling the API entirely from the - implementation details whilst maintaining RAII. - """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. - if X.dtype == Y.dtype == np.float64: - return PairwiseDistancesRadiusNeighborhood64.compute( - X=X, - Y=Y, - radius=radius, - metric=metric, - chunk_size=chunk_size, - metric_kwargs=metric_kwargs, - strategy=strategy, - sort_results=sort_results, - return_distance=return_distance, - ) - raise ValueError( - f"Only 64bit float datasets are supported at this time, " - f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." 
- ) - -##################### -# dtype-specialized implementations - -cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( - const DTYPE_t[:, ::1] X, - ITYPE_t num_threads, -): - """Compute the squared euclidean norm of the rows of X in parallel. - - This is faster than using np.einsum("ij, ij->i") even when using a single thread. - """ - cdef: - # Casting for X to remove the const qualifier is needed because APIs - # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' - # const qualifier. - # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * X_ptr = &X[0, 0] - ITYPE_t i = 0 - ITYPE_t n = X.shape[0] - ITYPE_t d = X.shape[1] - DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) - - for i in prange(n, schedule='static', nogil=True, num_threads=num_threads): - squared_row_norms[i] = _dot(d, X_ptr + i * d, 1, X_ptr + i * d, 1) - - return squared_row_norms - -cdef class GEMMTermComputer64: - """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. - - `FastEuclidean*` classes internally compute the squared Euclidean distances between - chunks of vectors X_c and Y_c using the following decomposition: - - - ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - - - This helper class is in charge of wrapping the common logic to compute - the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high - arithmetic intensity. - """ - cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y - - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - ITYPE_t dist_middle_terms_chunks_size - ITYPE_t n_features - ITYPE_t chunk_size - - # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM - vector[vector[DTYPE_t]] dist_middle_terms_chunks - - def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - self.X = X - self.Y = Y - self.effective_n_threads = effective_n_threads - self.chunks_n_threads = chunks_n_threads - self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size - self.n_features = n_features - self.chunk_size = chunk_size - - self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) - - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: - self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_init(self) nogil: - for thread_num in range(self.chunks_n_threads): - self.dist_middle_terms_chunks[thread_num].resize( - self.dist_middle_terms_chunks_size - ) - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - return - - cdef DTYPE_t * _compute_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] - 
const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() - - # Careful: LDA, LDB and LDC are given for F-ordered arrays - # in BLAS documentations, for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa - # - # Here, we use their counterpart values to work with C-ordered arrays. - BLAS_Order order = RowMajor - BLAS_Trans ta = NoTrans - BLAS_Trans tb = Trans - ITYPE_t m = X_c.shape[0] - ITYPE_t n = Y_c.shape[0] - ITYPE_t K = X_c.shape[1] - DTYPE_t alpha = - 2. - # Casting for A and B to remove the const is needed because APIs exposed via - # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. - # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * A = &X_c[0, 0] - DTYPE_t * B = &Y_c[0, 0] - ITYPE_t lda = X_c.shape[1] - ITYPE_t ldb = X_c.shape[1] - DTYPE_t beta = 0. - ITYPE_t ldc = Y_c.shape[0] - - # dist_middle_terms = `-2 * X_c @ Y_c.T` - _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) - - return dist_middle_terms - -cdef class PairwiseDistancesReduction64: - """Base 64bit implementation of PairwiseDistancesReduction.""" - - cdef: - readonly DatasetsPair datasets_pair - - # The number of threads that can be used is stored in effective_n_threads. - # - # The number of threads to use in the parallelization strategy - # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: - # for small datasets, fewer threads might be needed to loop over pair of chunks. - # - # Hence, the number of threads that _will_ be used for looping over chunks - # is stored in chunks_n_threads, allowing solely using what we need. 
- # - # Thus, an invariant is: - # - # chunks_n_threads <= effective_n_threads - # - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - - ITYPE_t n_samples_chunk, chunk_size - - ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk - ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk - - bint execute_in_parallel_on_Y - - def __init__( - self, - DatasetsPair datasets_pair, - chunk_size=None, - strategy=None, - ): - cdef: - ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks - - if chunk_size is None: - chunk_size = get_config().get("pairwise_dist_chunk_size", 256) - - self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) - - self.effective_n_threads = _openmp_effective_n_threads() - - self.datasets_pair = datasets_pair - - self.n_samples_X = datasets_pair.n_samples_X() - self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) - X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk - X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk - self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0) - - if X_n_samples_remainder != 0: - self.X_n_samples_last_chunk = X_n_samples_remainder - else: - self.X_n_samples_last_chunk = self.X_n_samples_chunk - - self.n_samples_Y = datasets_pair.n_samples_Y() - self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) - Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk - Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk - self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0) - - if Y_n_samples_remainder != 0: - self.Y_n_samples_last_chunk = Y_n_samples_remainder - else: - self.Y_n_samples_last_chunk = self.Y_n_samples_chunk - - if strategy is None: - strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') - - if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): - raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " - f"or 'auto', but currently strategy='{self.strategy}'.") - - if strategy == 'auto': - # This is a simple heuristic whose constant for the - # comparison has been chosen based on experiments. - if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: - strategy = 'parallel_on_X' - else: - strategy = 'parallel_on_Y' - - self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" - - # Not using less, not using more. - self.chunks_n_threads = min( - self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, - self.effective_n_threads, - ) - - @final - cdef void _parallel_on_X(self) nogil: - """Compute the pairwise distances of each row vector of X on Y - by parallelizing computation on the outer loop on chunks of X - and reduce them. - - This strategy dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. - - Private datastructures are modified internally by threads. - - Private template methods can be implemented on subclasses to - interact with those datastructures at various stages. 
- """ - cdef: - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - thread_num = _openmp_thread_num() - - # Allocating thread datastructures - self._parallel_on_X_parallel_init(thread_num) - - for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): - X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1: - X_end = X_start + self.X_n_samples_last_chunk - else: - X_end = X_start + self.X_n_samples_chunk - - # Reinitializing thread datastructures for the new X chunk - # If necessary, upcast X[X_start:X_end] to 64bit - self._parallel_on_X_init_chunk(thread_num, X_start, X_end) - - for Y_chunk_idx in range(self.Y_n_chunks): - Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1: - Y_end = Y_start + self.Y_n_samples_last_chunk - else: - Y_end = Y_start + self.Y_n_samples_chunk - - # If necessary, upcast Y[Y_start:Y_end] to 64bit - self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - - self._compute_and_reduce_distances_on_chunks( - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - - # Adjusting thread datastructures on the full pass on Y - self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end) - - # end: for X_chunk_idx - - # Deallocating thread datastructures - self._parallel_on_X_parallel_finalize(thread_num) - - # end: with nogil, parallel - return - - @final - cdef void _parallel_on_Y(self) nogil: - """Compute the pairwise distances of each row vector of X on Y - by parallelizing computation on the inner loop on chunks of Y - and reduce them. - - This strategy dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. - - Private datastructures are modified internally by threads. - - Private template methods can be implemented on subclasses to - interact with those datastructures at various stages. - """ - cdef: - ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx - ITYPE_t thread_num - - # Allocating datastructures shared by all threads - self._parallel_on_Y_init() - - for X_chunk_idx in range(self.X_n_chunks): - X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1: - X_end = X_start + self.X_n_samples_last_chunk - else: - X_end = X_start + self.X_n_samples_chunk - - with nogil, parallel(num_threads=self.chunks_n_threads): - thread_num = _openmp_thread_num() - - # Initializing datastructures used in this thread - # If necessary, upcast X[X_start:X_end] to 64bit - self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) - - for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): - Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1: - Y_end = Y_start + self.Y_n_samples_last_chunk - else: - Y_end = Y_start + self.Y_n_samples_chunk - - # If necessary, upcast Y[Y_start:Y_end] to 64bit - self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - - self._compute_and_reduce_distances_on_chunks( - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - # end: prange - - # Note: we don't need a _parallel_on_Y_finalize similarly. 
- # This can be introduced if needed. - - # end: with nogil, parallel - - # Synchronizing the thread datastructures with the main ones - self._parallel_on_Y_synchronize(X_start, X_end) - - # end: for X_chunk_idx - # Deallocating temporary datastructures and adjusting main datastructures - self._parallel_on_Y_finalize() - return - - # Placeholder methods which have to be implemented - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - """Compute the pairwise distances on two chunks of X and Y and reduce them. - - This is THE core computational method of PairwiseDistanceReductions64. - This must be implemented in subclasses agnostically from the parallelization - strategies. - """ - return - - def _finalize_results(self, bint return_distance): - """Callback adapting datastructures before returning results. - - This must be implemented in subclasses. - """ - return None - - # Placeholder methods which can be implemented - - cdef void compute_exact_distances(self) nogil: - """Convert rank-preserving distances to exact distances or recompute them.""" - return - - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - """Allocate datastructures used in a thread given its number.""" - return - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - """Initialise datastructures used in a thread given its number.""" - return - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. - - This is eventually used to upcast X[X_start:X_end] to 64bit. - """ - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - """Interact with datastructures after a reduction on chunks.""" - return - - cdef void _parallel_on_X_parallel_finalize( - self, - ITYPE_t thread_num - ) nogil: - """Interact with datastructures after executing all the reductions.""" - return - - cdef void _parallel_on_Y_init( - self, - ) nogil: - """Allocate datastructures used in all threads.""" - return - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - """Initialise datastructures used in a thread given its number.""" - return - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. - - This is eventually used to upcast Y[Y_start:Y_end] to 64bit. - """ - return - - cdef void _parallel_on_Y_synchronize( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - """Update thread datastructures before leaving a parallel region.""" - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - """Update datastructures after executing all the reductions.""" - return - -cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): - """64bit implementation of PairwiseDistancesArgKmin.""" - - cdef: - ITYPE_t k - - ITYPE_t[:, ::1] argkmin_indices - DTYPE_t[:, ::1] argkmin_distances - - # Used as array of pointers to private datastructures used in threads. 
- DTYPE_t ** heaps_r_distances_chunks - ITYPE_t ** heaps_indices_chunks - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - bint return_distance=False, - ): - """Compute the argkmin reduction. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`PairwiseDistancesArgKmin64`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance should directly be created outside of this class method. - """ - if ( - metric in ("euclidean", "sqeuclidean") - and not issparse(X) - and not issparse(Y) - ): - # Specialized implementation with improved arithmetic intensity - # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to the BLAS GEMM routine as explained - # in more details in the docstring. - use_squared_distances = metric == "sqeuclidean" - pda = FastEuclideanPairwiseDistancesArgKmin64( - X=X, Y=Y, k=k, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - metric_kwargs=metric_kwargs, - ) - else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = PairwiseDistancesArgKmin64( - datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results(return_distance) - - def __init__( - self, - DatasetsPair datasets_pair, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - ) - self.k = check_scalar(k, "k", Integral, min_val=1) - - # Allocating pointers to datastructures but not the datastructures themselves. - # There are as many pointers as effective threads. - # - # For the sake of explicitness: - # - when parallelizing on X, the pointers of those heaps are referencing - # (with proper offsets) addresses of the two main heaps (see below) - # - when parallelizing on Y, the pointers of those heaps are referencing - # small heaps which are thread-wise-allocated and whose content will be - # merged with the main heaps'. - self.heaps_r_distances_chunks = malloc( - sizeof(DTYPE_t *) * self.chunks_n_threads - ) - self.heaps_indices_chunks = malloc( - sizeof(ITYPE_t *) * self.chunks_n_threads - ) - - # Main heaps which will be returned as results by `PairwiseDistancesArgKmin64.compute`. 
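# Illustrative aside (a hedged sketch, not part of this module): up to ordering,
# the content of these two main heaps corresponds to what np.argpartition would
# select from a fully materialized distance matrix. The toy names `dist`, `idx`
# and `val` below are assumptions for illustration only; the chunked heaps avoid
# ever materializing `dist`.
import numpy as np

rng = np.random.default_rng(0)
dist = rng.random((5, 8))   # toy (n_samples_X, n_samples_Y) distance matrix
k = 3
idx = np.argpartition(dist, kth=k - 1, axis=1)[:, :k]   # k smallest indices per row (unordered)
val = np.take_along_axis(dist, idx, axis=1)             # matching distances, later sorted ascending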
- self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) - self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) - - def __dealloc__(self): - if self.heaps_indices_chunks is not NULL: - free(self.heaps_indices_chunks) - - if self.heaps_r_distances_chunks is not NULL: - free(self.heaps_r_distances_chunks) - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_samples_X = X_end - X_start - ITYPE_t n_samples_Y = Y_end - Y_start - DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] - ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - - # Pushing the distances and their associated indices on a heap - # which by construction will keep track of the argkmin. - for i in range(n_samples_X): - for j in range(n_samples_Y): - heap_push( - heaps_r_distances + i * self.k, - heaps_indices + i * self.k, - self.k, - self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), - Y_start + j, - ) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - # As this strategy is embarrassingly parallel, we can set each - # thread's heaps pointer to the proper position on the main heaps. - self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] - self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] - - @final - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, jdx - - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef: - # Maximum number of scalar elements (the last chunks can be smaller) - ITYPE_t heaps_size = self.X_n_samples_chunk * self.k - ITYPE_t thread_num - - # The allocation is done in parallel for data locality purposes: this way - # the heaps used in each threads are allocated in pages which are closer - # to the CPU core used by the thread. - # See comments about First Touch Placement Policy: - # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa - for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, - num_threads=self.chunks_n_threads): - # As chunks of X are shared across threads, so must their - # heaps. To solve this, each thread has its own heaps - # which are then synchronised back in the main ones. 
- self.heaps_r_distances_chunks[thread_num] = malloc( - heaps_size * sizeof(DTYPE_t) - ) - self.heaps_indices_chunks[thread_num] = malloc( - heaps_size * sizeof(ITYPE_t) - ) - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - # Initialising heaps (memset can't be used here) - for idx in range(self.X_n_samples_chunk * self.k): - self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX - self.heaps_indices_chunks[thread_num][idx] = -1 - - @final - cdef void _parallel_on_Y_synchronize( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, jdx, thread_num - with nogil, parallel(num_threads=self.effective_n_threads): - # Synchronising the thread heaps with the main heaps. - # This is done in parallel sample-wise (no need for locks). - # - # This might break each thread's data locality as each heap which - # was allocated in a thread is being now being used in several threads. - # - # Still, this parallel pattern has shown to be efficient in practice. - for idx in prange(X_end - X_start, schedule="static"): - for thread_num in range(self.chunks_n_threads): - for jdx in range(self.k): - heap_push( - &self.argkmin_distances[X_start + idx, 0], - &self.argkmin_indices[X_start + idx, 0], - self.k, - self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], - self.heaps_indices_chunks[thread_num][idx * self.k + jdx], - ) - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t idx, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[idx, 0], - &self.argkmin_indices[idx, 0], - self.k, - ) - return - - cdef void compute_exact_distances(self) nogil: - cdef: - ITYPE_t i, j - ITYPE_t[:, ::1] Y_indices = self.argkmin_indices - DTYPE_t[:, ::1] distances = self.argkmin_distances - for i in prange(self.n_samples_X, schedule='static', nogil=True, - num_threads=self.effective_n_threads): - for j in range(self.k): - distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. - max(distances[i, j], 0.) - ) - - def _finalize_results(self, bint return_distance=False): - if return_distance: - # We need to recompute distances because we relied on - # surrogate distances for the reduction. - self.compute_exact_distances() - - # Values are returned identically to the way `KNeighborsMixin.kneighbors` - # returns values. This is counter-intuitive but this allows not using - # complex adaptations where `PairwiseDistancesArgKmin64.compute` is called. 
- return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) - - return np.asarray(self.argkmin_indices) - - -cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): - """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesArgKmin.""" - cdef: - GEMMTermComputer64 gemm_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared - - bint use_squared_distances - - @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesArgKmin64.is_usable_for(X, Y, metric) and - not _in_unstable_openblas_configuration()) - - def __init__( - self, - X, - Y, - ITYPE_t k, - bint use_squared_distances=False, - chunk_size=None, - strategy=None, - metric_kwargs=None, - ): - if ( - metric_kwargs is not None and - len(metric_kwargs) > 0 and - "Y_norm_squared" not in metric_kwargs - ): - warnings.warn( - f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " - f"usable for this case (FastEuclideanPairwiseDistancesArgKmin) and will be ignored.", - UserWarning, - stacklevel=3, - ) - - super().__init__( - # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair - cdef: - DenseDenseDatasetsPair datasets_pair = ( - self.datasets_pair - ) - ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - - self.gemm_term_computer = GEMMTermComputer64( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") - else: - self.Y_norm_squared = _sqeuclidean_row_norms64(datasets_pair.Y, self.effective_n_threads) - - # Do not recompute norms if datasets are identical. 
- self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms64(datasets_pair.X, self.effective_n_threads) - ) - self.use_squared_distances = use_squared_distances - - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - PairwiseDistancesArgKmin64.compute_exact_distances(self) - - @final - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesArgKmin64._parallel_on_X_parallel_init(self, thread_num) - self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) - - - @final - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - PairwiseDistancesArgKmin64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) - - - @final - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesArgKmin64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num, - ) - - - @final - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef ITYPE_t thread_num - PairwiseDistancesArgKmin64._parallel_on_Y_init(self) - self.gemm_term_computer._parallel_on_Y_init() - - - @final - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - PairwiseDistancesArgKmin64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) - - - @final - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesArgKmin64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - - @final - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] - ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num] - - - # Pushing the distance and their associated indices on heaps - # which keep tracks of the argkmin. 
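# Illustrative aside (a hedged sketch, not part of this module): the quantity
# pushed in the loop below is the expansion ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2,
# whose cross term is the GEMM output described above. The toy chunk names
# `X_c` and `Y_c` below are assumptions for illustration only.
import numpy as np

rng = np.random.default_rng(0)
X_c, Y_c = rng.random((3, 4)), rng.random((5, 4))
middle = -2.0 * (X_c @ Y_c.T)                                      # role of dist_middle_terms
sq = (X_c ** 2).sum(axis=1)[:, None] + middle + (Y_c ** 2).sum(axis=1)[None, :]
exact = ((X_c[:, None, :] - Y_c[None, :, :]) ** 2).sum(axis=-1)
assert np.allclose(sq, exact)                                      # same squared distances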
- for i in range(n_X): - for j in range(n_Y): - heap_push( - heaps_r_distances + i * self.k, - heaps_indices + i * self.k, - self.k, - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # - ( - self.X_norm_squared[i + X_start] + - dist_middle_terms[i * n_Y + j] + - self.Y_norm_squared[j + Y_start] - ), - j + Y_start, - ) - - -cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): - """64bit implementation of PairwiseDistancesArgKmin.""" - - cdef: - DTYPE_t radius - - # DistanceMetric compute rank-preserving surrogate distance via rdist - # which are proxies necessitating less computations. - # We get the equivalent for the radius to be able to compare it against - # vectors' rank-preserving surrogate distances. - DTYPE_t r_radius - - # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. - # - # For this implementation, we want resizable buffers which we will wrap - # into numpy arrays at the end. std::vector comes as a handy container - # for interacting efficiently with resizable buffers. - # - # Though it is possible to access their buffer address with - # std::vector::data, they can't be stolen: buffers lifetime - # is tied to their std::vector and are deallocated when - # std::vectors are. - # - # To solve this, we dynamically allocate std::vectors and then - # encapsulate them in a StdVectorSentinel responsible for - # freeing them when the associated np.ndarray is freed. - # - # Shared pointers (defined via shared_ptr) are use for safer memory management. - # Unique pointers (defined via unique_ptr) can't be used as datastructures - # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. - shared_ptr[vector[vector[ITYPE_t]]] neigh_indices - shared_ptr[vector[vector[DTYPE_t]]] neigh_distances - - # Used as array of pointers to private datastructures used in threads. - vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks - vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks - - bint sort_results - - @classmethod - def compute( - cls, - X, - Y, - DTYPE_t radius, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - bint return_distance=False, - bint sort_results=False, - ): - """Compute the radius-neighbors reduction. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`PairwiseDistancesRadiusNeighborhood64`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance should directly be created outside of this class method. - """ - if ( - metric in ("euclidean", "sqeuclidean") - and not issparse(X) - and not issparse(Y) - ): - # Specialized implementation with improved arithmetic intensity - # and vector instructions (SIMD) by processing several vectors - # at time to leverage a call to the BLAS GEMM routine as explained - # in more details in the docstring. 
- use_squared_distances = metric == "sqeuclidean" - pda = FastEuclideanPairwiseDistancesRadiusNeighborhood64( - X=X, Y=Y, radius=radius, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - metric_kwargs=metric_kwargs, - strategy=strategy, - sort_results=sort_results, - ) - else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = PairwiseDistancesRadiusNeighborhood64( - datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), - radius=radius, - chunk_size=chunk_size, - metric_kwargs=metric_kwargs, - strategy=strategy, - sort_results=sort_results, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results(return_distance) - - - def __init__( - self, - DatasetsPair datasets_pair, - DTYPE_t radius, - chunk_size=None, - strategy=None, - sort_results=False, - metric_kwargs=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - ) - - self.radius = check_scalar(radius, "radius", Real, min_val=0) - self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) - self.sort_results = sort_results - - # Allocating pointers to datastructures but not the datastructures themselves. - # There are as many pointers as effective threads. - # - # For the sake of explicitness: - # - when parallelizing on X, the pointers of those heaps are referencing - # self.neigh_distances and self.neigh_indices - # - when parallelizing on Y, the pointers of those heaps are referencing - # std::vectors of std::vectors which are thread-wise-allocated and whose - # content will be merged into self.neigh_distances and self.neigh_indices. - self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]]( - self.chunks_n_threads - ) - self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]]( - self.chunks_n_threads - ) - - # Temporary datastructures which will be coerced to numpy arrays on before - # PairwiseDistancesRadiusNeighborhood.compute "return" and will be then freed. - self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t r_dist_i_j - - for i in range(X_start, X_end): - for j in range(Y_start, Y_end): - r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) - if r_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) - - def _finalize_results(self, bint return_distance=False): - if return_distance: - # We need to recompute distances because we relied on - # surrogate distances for the reduction. 
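# Illustrative aside (a hedged sketch, not part of this module): surrogate
# distances are rank-preserving proxies (for Euclidean metrics, the squared
# distance), so neighbors can be selected cheaply and only the kept values are
# converted at the end. The toy array below is an assumption for illustration.
import numpy as np

surrogate = np.array([4.0, 2.25, 0.0])        # squared Euclidean distances kept by the reduction
exact = np.sqrt(np.maximum(surrogate, 0.0))   # clip tiny negatives from cancellation, then take the root
assert (np.argsort(surrogate) == np.argsort(exact)).all()   # ranks unchanged by the monotonic conversion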
- self.compute_exact_distances() - return ( - coerce_vectors_to_nd_arrays(self.neigh_distances), - coerce_vectors_to_nd_arrays(self.neigh_indices), - ) - - return coerce_vectors_to_nd_arrays(self.neigh_indices) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - - # As this strategy is embarrassingly parallel, we can set the - # thread vectors' pointers to the main vectors'. - self.neigh_distances_chunks[thread_num] = self.neigh_distances - self.neigh_indices_chunks[thread_num] = self.neigh_indices - - @final - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, jdx - - # Sorting neighbors for each query vector of X - if self.sort_results: - for idx in range(X_start, X_end): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef: - ITYPE_t thread_num - # As chunks of X are shared across threads, so must datastructures to avoid race - # conditions: each thread has its own vectors of n_samples_X vectors which are - # then merged back in the main n_samples_X vectors. - for thread_num in range(self.chunks_n_threads): - self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) - self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) - - @final - cdef void _merge_vectors( - self, - ITYPE_t idx, - ITYPE_t num_threads, - ) nogil: - cdef: - ITYPE_t thread_num - ITYPE_t idx_n_elements = 0 - ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() - - # Resizing buffers only once for the given number of elements. - for thread_num in range(num_threads): - idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) - deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) - - # Moving the elements by range using the range first element - # as the reference for the insertion. - for thread_num in range(num_threads): - move( - deref(self.neigh_distances_chunks[thread_num])[idx].begin(), - deref(self.neigh_distances_chunks[thread_num])[idx].end(), - deref(self.neigh_distances)[idx].begin() + last_element_idx - ) - move( - deref(self.neigh_indices_chunks[thread_num])[idx].begin(), - deref(self.neigh_indices_chunks[thread_num])[idx].end(), - deref(self.neigh_indices)[idx].begin() + last_element_idx - ) - last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() - - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current - - with nogil, parallel(num_threads=self.effective_n_threads): - # Merge vectors used in threads into the main ones. - # This is done in parallel sample-wise (no need for locks) - # using dynamic scheduling because we might not have - # the same number of neighbors for each query vector. - for idx in prange(self.n_samples_X, schedule='static'): - self._merge_vectors(idx, self.chunks_n_threads) - - # The content of the vector have been std::moved. - # Hence they can't be used anymore and can be deleted. - # Their deletion is carried out automatically as the - # implementation relies on shared pointers. - - # Sort in parallel in ascending order w.r.t the distances if requested. 
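Editorial aside: a pure-Python analogue of `_merge_vectors` may help read the code above. When parallelizing on Y, each thread accumulates its own per-query lists of neighbors; once the parallel region is over, the thread-local lists are concatenated back into the main per-query buffers (the Cython code resizes the destination once and then uses `std::move`). Illustrative data only:

    n_samples_X = 3
    # Hits found by two hypothetical threads: {query index: [(distance, Y index), ...]}
    thread_hits = [
        {0: [(0.1, 4)], 2: [(0.3, 1)]},
        {0: [(0.2, 7)], 1: [(0.5, 2)]},
    ]

    neigh = [[] for _ in range(n_samples_X)]
    for per_thread in thread_hits:
        for query_idx, hits in per_thread.items():
            neigh[query_idx].extend(hits)       # stands in for the std::move into the main vector

    # Sorted per query afterwards when sort_results=True, as done just below.
    neigh = [sorted(hits) for hits in neigh]
    assert neigh[0] == [(0.1, 4), (0.2, 7)]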
- if self.sort_results: - for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - deref(self.neigh_distances)[idx].data(), - deref(self.neigh_indices)[idx].data(), - deref(self.neigh_indices)[idx].size() - ) - - return - - cdef void compute_exact_distances(self) nogil: - """Convert rank-preserving distances to pairwise distances in parallel.""" - cdef: - ITYPE_t i, j - - for i in prange(self.n_samples_X, nogil=True, schedule='static', - num_threads=self.effective_n_threads): - for j in range(deref(self.neigh_indices)[i].size()): - deref(self.neigh_distances)[i][j] = ( - self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. - max(deref(self.neigh_distances)[i][j], 0.) - ) - ) - - -cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesRadiusNeighborhood64): - """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesRadiusNeighborhood.""" - cdef: - GEMMTermComputer64 gemm_term_computer - const DTYPE_t[::1] X_norm_squared - const DTYPE_t[::1] Y_norm_squared - - bint use_squared_distances - - @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesRadiusNeighborhood64.is_usable_for(X, Y, metric) - and not _in_unstable_openblas_configuration()) - - def __init__( - self, - X, - Y, - DTYPE_t radius, - bint use_squared_distances=False, - chunk_size=None, - strategy=None, - sort_results=False, - metric_kwargs=None, - ): - if ( - metric_kwargs is not None and - len(metric_kwargs) > 0 and - "Y_norm_squared" not in metric_kwargs - ): - warnings.warn( - f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " - f"usable for this case (FastEuclideanPairwiseDistancesRadiusNeighborhood) and will be ignored.", - UserWarning, - stacklevel=3, - ) - - super().__init__( - # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), - radius=radius, - chunk_size=chunk_size, - strategy=strategy, - sort_results=sort_results, - metric_kwargs=metric_kwargs, - ) - # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair - cdef: - DenseDenseDatasetsPair datasets_pair = self.datasets_pair - ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - - self.gemm_term_computer = GEMMTermComputer64( - datasets_pair.X, - datasets_pair.Y, - self.effective_n_threads, - self.chunks_n_threads, - dist_middle_terms_chunks_size, - n_features=datasets_pair.X.shape[1], - chunk_size=self.chunk_size, - ) - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") - else: - self.Y_norm_squared = _sqeuclidean_row_norms64(datasets_pair.Y, self.effective_n_threads) - - # Do not recompute norms if datasets are identical. - self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms64(datasets_pair.X, self.effective_n_threads) - ) - self.use_squared_distances = use_squared_distances - - if use_squared_distances: - # In this specialisation and this setup, the value passed to the radius is - # already considered to be the adapted radius, so we overwrite it. 
- self.r_radius = radius - - @final - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesRadiusNeighborhood64._parallel_on_X_parallel_init(self, thread_num) - self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) - - @final - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - PairwiseDistancesRadiusNeighborhood64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) - - @final - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesRadiusNeighborhood64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num, - ) - - @final - cdef void _parallel_on_Y_init( - self, - ) nogil: - cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_init(self) - self.gemm_term_computer._parallel_on_Y_init() - - @final - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) - self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) - - @final - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - X_start, X_end, - Y_start, Y_end, - thread_num, - ) - self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - PairwiseDistancesRadiusNeighborhood64.compute_exact_distances(self) - - @final - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( - X_start, X_end, Y_start, Y_end, thread_num - ) - - # Pushing the distance and their associated indices in vectors. 
- for i in range(n_X): - for j in range(n_Y): - # Using the squared euclidean distance as the rank-preserving distance: - # - # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - # - squared_dist_i_j = ( - self.X_norm_squared[i + X_start] - + dist_middle_terms[i * n_Y + j] - + self.Y_norm_squared[j + Y_start] - ) - if squared_dist_i_j <= self.r_radius: - deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) - deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py new file mode 100644 index 0000000000000..d420060ca78df --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -0,0 +1,101 @@ +# Pairwise Distances Reductions +# ============================= +# +# Author: Julien Jerphanion +# +# Overview +# -------- +# +# This module provides routines to compute pairwise distances between a set +# of row vectors of X and another set of row vectors of Y and apply a +# reduction on top. The canonical example is the brute-force computation +# of the top k nearest neighbors by leveraging the arg-k-min reduction. +# +# The reduction takes a matrix of pairwise distances between rows of X and Y +# as input and outputs an aggregate data-structure for each row of X. The +# aggregate values are typically smaller than the number of rows in Y, hence +# the term reduction. +# +# For computational reasons, the reduction are performed on the fly on chunks +# of rows of X and Y so as to keep intermediate data-structures in CPU cache +# and avoid unnecessary round trips of large distance arrays with the RAM +# that would otherwise severely degrade the speed by making the overall +# processing memory-bound. +# +# Finally, the routines follow a generic parallelization template to process +# chunks of data with OpenMP loops (via Cython prange), either on rows of X +# or rows of Y depending on their respective sizes. +# +# +# Dispatching to specialized implementations +# ------------------------------------------ +# +# Dispatchers are meant to be used in the Python code. Under the hood, a +# dispatcher must only define the logic to choose at runtime to the correct +# dtype-specialized :class:`PairwiseDistancesReduction` implementation based +# on the dtype of X and of Y. +# +# +# High-level diagram +# ------------------ +# +# Legend: +# +# A ---⊳ B: A inherits from B +# A ---x B: A dispatches to B +# +# +# (base dispatcher) +# PairwiseDistancesReduction +# ∆ +# | +# | +# +-----------------+-----------------+ +# | | +# (dispatcher) (dispatcher) +# PairwiseDistancesArgKmin PairwiseDistancesRadiusNeighbors +# | | +# | | +# | | +# | (64bit implem.) | +# | PairwiseDistancesReduction64 | +# | ∆ | +# | | | +# | | | +# | +-----------------+-----------------+ | +# | | | | +# | | | | +# x | | x +# PairwiseDistancesArgKmin64 PairwiseDistancesRadiusNeighbors64 +# | ∆ ∆ | +# | | | | +# x | | | +# FastEuclideanPairwiseDistancesArgKmin64 | | +# | | +# | x +# FastEuclideanPairwiseDistancesRadiusNeighbors64 +# +# For instance :class:`PairwiseDistancesArgKmin`, dispatches to +# :class:`PairwiseDistancesArgKmin64` if X and Y are both dense NumPy arrays +# with a float64 dtype. 
+# +# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean", +# :class:`PairwiseDistancesArgKmin64` further dispatches to +# :class:`FastEuclideanPairwiseDistancesArgKmin64` a specialized subclass +# to optimally handle the Euclidean distance case using the Generalized Matrix +# Multiplication (see the docstring of :class:`GEMMTermComputer64` for details). + + +from ._dispatcher import ( + PairwiseDistancesReduction, + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, + sqeuclidean_row_norms, +) + +__all__ = [ + "PairwiseDistancesReduction", + "PairwiseDistancesArgKmin", + "PairwiseDistancesRadiusNeighborhood", + "sqeuclidean_row_norms", +] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd new file mode 100644 index 0000000000000..34d3339e1c9e0 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd @@ -0,0 +1,33 @@ +cimport numpy as cnp + +from ._base cimport ( + PairwiseDistancesReduction64, +) +from ._gemm_term_computer cimport GEMMTermComputer64 + +from ...utils._typedefs cimport ITYPE_t, DTYPE_t + +cnp.import_array() + +cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): + """64bit implementation of PairwiseDistancesArgKmin.""" + + cdef: + ITYPE_t k + + ITYPE_t[:, ::1] argkmin_indices + DTYPE_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. + DTYPE_t ** heaps_r_distances_chunks + ITYPE_t ** heaps_indices_chunks + + +cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): + """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesArgKmin.""" + cdef: + GEMMTermComputer64 gemm_term_computer + const DTYPE_t[::1] X_norm_squared + const DTYPE_t[::1] Y_norm_squared + + bint use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx new file mode 100644 index 0000000000000..2f378543e1f97 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx @@ -0,0 +1,501 @@ +cimport numpy as cnp + +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from cython cimport final +from cython.parallel cimport parallel, prange + +from ._base cimport ( + PairwiseDistancesReduction64, + _sqeuclidean_row_norms64, +) + +from ._datasets_pair cimport ( + DatasetsPair, + DenseDenseDatasetsPair, +) + +from ._gemm_term_computer cimport GEMMTermComputer64 + +from ...utils._heap cimport heap_push +from ...utils._sorting cimport simultaneous_sort +from ...utils._typedefs cimport ITYPE_t, DTYPE_t + +import numpy as np +import warnings + +from numbers import Integral +from scipy.sparse import issparse +from sklearn.utils import check_scalar, _in_unstable_openblas_configuration +from sklearn.utils.fixes import threadpool_limits +from ...utils._typedefs import ITYPE, DTYPE + +cnp.import_array() + + +cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): + """64bit implementation of PairwiseDistancesArgKmin.""" + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + ): + """Compute the argkmin reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`PairwiseDistancesArgKmin64`. 
+ + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. + """ + if ( + metric in ("euclidean", "sqeuclidean") + and not issparse(X) + and not issparse(Y) + ): + # Specialized implementation with improved arithmetic intensity + # and vector instructions (SIMD) by processing several vectors + # at time to leverage a call to the BLAS GEMM routine as explained + # in more details in the docstring. + use_squared_distances = metric == "sqeuclidean" + pda = FastEuclideanPairwiseDistancesArgKmin64( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = PairwiseDistancesArgKmin64( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + ) + self.k = check_scalar(k, "k", Integral, min_val=1) + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, the pointers of those heaps are referencing + # (with proper offsets) addresses of the two main heaps (see below) + # - when parallelizing on Y, the pointers of those heaps are referencing + # small heaps which are thread-wise-allocated and whose content will be + # merged with the main heaps'. + self.heaps_r_distances_chunks = malloc( + sizeof(DTYPE_t *) * self.chunks_n_threads + ) + self.heaps_indices_chunks = malloc( + sizeof(ITYPE_t *) * self.chunks_n_threads + ) + + # Main heaps which will be returned as results by `PairwiseDistancesArgKmin64.compute`. + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_samples_X = X_end - X_start + ITYPE_t n_samples_Y = Y_end - Y_start + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distances and their associated indices on a heap + # which by construction will keep track of the argkmin. 
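Editorial aside: the heaps filled here are fixed-size max-heaps keyed on the surrogate distance, so after a full pass over Y each heap holds the k smallest distances seen and their indices. A minimal Python analogue of what `heap_push` does (the real helper in `sklearn.utils._heap` works on raw C buffers; this sketch uses `heapq` with negated distances):

    import heapq

    def push(heap, k, dist, idx):
        # Keep the k smallest distances seen so far.
        if len(heap) < k:
            heapq.heappush(heap, (-dist, idx))
        elif -dist > heap[0][0]:                # dist is smaller than the current k-th best
            heapq.heapreplace(heap, (-dist, idx))

    heap, k = [], 3
    for idx, dist in enumerate([0.9, 0.1, 0.5, 0.3, 0.7]):
        push(heap, k, dist, idx)

    assert sorted(-d for d, _ in heap) == [0.1, 0.3, 0.5]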
+ for i in range(n_samples_X): + for j in range(n_samples_Y): + heap_push( + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + self.k, + self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), + Y_start + j, + ) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + # As this strategy is embarrassingly parallel, we can set each + # thread's heaps pointer to the proper position on the main heaps. + self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + + cdef void _parallel_on_Y_init( + self, + ) nogil: + cdef: + # Maximum number of scalar elements (the last chunks can be smaller) + ITYPE_t heaps_size = self.X_n_samples_chunk * self.k + ITYPE_t thread_num + + # The allocation is done in parallel for data locality purposes: this way + # the heaps used in each threads are allocated in pages which are closer + # to the CPU core used by the thread. + # See comments about First Touch Placement Policy: + # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa + for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, + num_threads=self.chunks_n_threads): + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own heaps + # which are then synchronised back in the main ones. + self.heaps_r_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(DTYPE_t) + ) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(ITYPE_t) + ) + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + # Initialising heaps (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX + self.heaps_indices_chunks[thread_num][idx] = -1 + + @final + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num + with nogil, parallel(num_threads=self.effective_n_threads): + # Synchronising the thread heaps with the main heaps. + # This is done in parallel sample-wise (no need for locks). + # + # This might break each thread's data locality as each heap which + # was allocated in a thread is being now being used in several threads. + # + # Still, this parallel pattern has shown to be efficient in practice. 
+ for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(self.chunks_n_threads): + for jdx in range(self.k): + heap_push( + &self.argkmin_distances[X_start + idx, 0], + &self.argkmin_indices[X_start + idx, 0], + self.k, + self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t idx, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) nogil: + cdef: + ITYPE_t i, j + ITYPE_t[:, ::1] Y_indices = self.argkmin_indices + DTYPE_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_samples_X, schedule='static', nogil=True, + num_threads=self.effective_n_threads): + for j in range(self.k): + distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + max(distances[i, j], 0.) + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + + # Values are returned identically to the way `KNeighborsMixin.kneighbors` + # returns values. This is counter-intuitive but this allows not using + # complex adaptations where `PairwiseDistancesArgKmin64.compute` is called. 
+ return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) + + +cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): + """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesArgKmin.""" + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesArgKmin64.is_usable_for(X, Y, metric) and + not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + ITYPE_t k, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + metric_kwargs=None, + ): + if ( + metric_kwargs is not None and + len(metric_kwargs) > 0 and + "Y_norm_squared" not in metric_kwargs + ): + warnings.warn( + f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " + f"usable for this case (FastEuclideanPairwiseDistancesArgKmin) and will be ignored.", + UserWarning, + stacklevel=3, + ) + + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = ( + self.datasets_pair + ) + ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + + self.gemm_term_computer = GEMMTermComputer64( + datasets_pair.X, + datasets_pair.Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + else: + self.Y_norm_squared = _sqeuclidean_row_norms64(datasets_pair.Y, self.effective_n_threads) + + # Do not recompute norms if datasets are identical. 
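Editorial aside: as the code above shows, precomputed squared row norms of Y can be supplied through `metric_kwargs` to avoid recomputing them on every call; any other key is ignored with a warning. A hedged usage sketch of this internal API (names as exposed by this PR, data purely illustrative):

    import numpy as np
    from sklearn.metrics._pairwise_distances_reduction import (
        PairwiseDistancesArgKmin,
        sqeuclidean_row_norms,
    )

    rng = np.random.RandomState(0)
    X, Y = rng.rand(50, 8), rng.rand(80, 8)

    # Computed once and reused across several queries against the same Y.
    Y_norm_squared = sqeuclidean_row_norms(Y, num_threads=1)

    indices = PairwiseDistancesArgKmin.compute(
        X, Y, k=3, metric="euclidean",
        metric_kwargs={"Y_norm_squared": Y_norm_squared},
    )
    assert indices.shape == (50, 3)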
+ self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms64(datasets_pair.X, self.effective_n_threads) + ) + self.use_squared_distances = use_squared_distances + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesArgKmin64.compute_exact_distances(self) + + @final + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_X_parallel_init(self, thread_num) + self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + + @final + cdef void _parallel_on_Y_init( + self, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesArgKmin64._parallel_on_Y_init(self) + self.gemm_term_computer._parallel_on_Y_init() + + + @final + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + DTYPE_t * dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num] + + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. 
+ for i in range(n_X): + for j in range(n_Y): + heap_push( + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + self.k, + # Using the squared euclidean distance as the rank-preserving distance: + # + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # + ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ), + j + Y_start, + ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd new file mode 100644 index 0000000000000..9f6ad45cb839a --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd @@ -0,0 +1,128 @@ +cimport numpy as cnp + +from cython cimport final + +from ._datasets_pair cimport DatasetsPair +from ...utils._typedefs cimport ITYPE_t, DTYPE_t + +cnp.import_array() + + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +) + +cdef class PairwiseDistancesReduction64: + """Base 64bit implementation of PairwiseDistancesReduction.""" + + cdef: + readonly DatasetsPair datasets_pair + + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelization strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, fewer threads might be needed to loop over pair of chunks. + # + # Hence, the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. + # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + + ITYPE_t n_samples_chunk, chunk_size + + ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk + + bint execute_in_parallel_on_Y + + @final + cdef void _parallel_on_X(self) nogil + + @final + cdef void _parallel_on_Y(self) nogil + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) nogil + + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_init( + self, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_finalize( + self, + ) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx 
b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx new file mode 100644 index 0000000000000..07506e3616a74 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx @@ -0,0 +1,372 @@ +cimport numpy as cnp +import numpy as np + +from sklearn import get_config +from cython cimport final +from cython.parallel cimport parallel, prange + +from ._datasets_pair cimport DatasetsPair +from ...utils._cython_blas cimport _dot +from ...utils._openmp_helpers cimport _openmp_thread_num +from ...utils._typedefs cimport ITYPE_t, DTYPE_t + +from numbers import Integral +from sklearn.utils import check_scalar +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._typedefs import ITYPE, DTYPE + +cnp.import_array() + +##################### + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + DTYPE_t * X_ptr = &X[0, 0] + ITYPE_t idx = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return squared_row_norms + + +cdef class PairwiseDistancesReduction64: + """Base 64bit implementation of PairwiseDistancesReduction.""" + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + strategy=None, + ): + cdef: + ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", 256) + + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) + + self.effective_n_threads = _openmp_effective_n_threads() + + self.datasets_pair = datasets_pair + + self.n_samples_X = datasets_pair.n_samples_X() + self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) + X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk + X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0) + + if X_n_samples_remainder != 0: + self.X_n_samples_last_chunk = X_n_samples_remainder + else: + self.X_n_samples_last_chunk = self.X_n_samples_chunk + + self.n_samples_Y = datasets_pair.n_samples_Y() + self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) + Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk + Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0) + + if Y_n_samples_remainder != 0: + self.Y_n_samples_last_chunk = Y_n_samples_remainder + else: + self.Y_n_samples_last_chunk = self.Y_n_samples_chunk + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. 
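Editorial aside: the chunk bookkeeping above splits each dataset into fixed-size chunks plus a possibly smaller trailing chunk; the loop bounds in `_parallel_on_X` and `_parallel_on_Y` are derived from these quantities. A sketch of the arithmetic with illustrative numbers:

    n_samples_X, chunk_size = 1000, 256

    X_n_samples_chunk = min(n_samples_X, chunk_size)
    X_n_full_chunks = n_samples_X // X_n_samples_chunk
    remainder = n_samples_X % X_n_samples_chunk
    X_n_chunks = X_n_full_chunks + (remainder != 0)
    X_n_samples_last_chunk = remainder if remainder != 0 else X_n_samples_chunk

    # 3 full chunks of 256 samples plus a trailing chunk of 232 samples.
    assert (X_n_chunks, X_n_samples_last_chunk) == (4, 232)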
+ if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + + # Not using less, not using more. + self.chunks_n_threads = min( + self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, + self.effective_n_threads, + ) + + @final + cdef void _parallel_on_X(self) nogil: + """Compute the pairwise distances of each row vector of X on Y + by parallelizing computation on the outer loop on chunks of X + and reduce them. + + This strategy dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. + """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = _openmp_thread_num() + + # Allocating thread datastructures + self._parallel_on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + # If necessary, upcast X[X_start:X_end] to 64bit + self._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk + else: + Y_end = Y_start + self.Y_n_samples_chunk + + # If necessary, upcast Y[Y_start:Y_end] to 64bit + self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._parallel_on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) nogil: + """Compute the pairwise distances of each row vector of X on Y + by parallelizing computation on the inner loop on chunks of Y + and reduce them. + + This strategy dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t thread_num + + # Allocating datastructures shared by all threads + self._parallel_on_Y_init() + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = _openmp_thread_num() + + # Initializing datastructures used in this thread + # If necessary, upcast X[X_start:X_end] to 64bit + self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk + else: + Y_end = Y_start + self.Y_n_samples_chunk + + # If necessary, upcast Y[Y_start:Y_end] to 64bit + self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + + # Note: we don't need a _parallel_on_Y_finalize similarly. + # This can be introduced if needed. + + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._parallel_on_Y_synchronize(X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._parallel_on_Y_finalize() + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is THE core computational method of PairwiseDistanceReductions64. + This must be implemented in subclasses agnostically from the parallelization + strategies. + """ + return + + def _finalize_results(self, bint return_distance): + """Callback adapting datastructures before returning results. + + This must be implemented in subclasses. + """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) nogil: + """Convert rank-preserving distances to exact distances or recompute them.""" + return + + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. + + This is eventually used to upcast X[X_start:X_end] to 64bit. 
+ """ + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _parallel_on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _parallel_on_Y_init( + self, + ) nogil: + """Allocate datastructures used in all threads.""" + return + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. + + This is eventually used to upcast Y[Y_start:Y_end] to 64bit. + """ + return + + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Update thread datastructures before leaving a parallel region.""" + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + """Update datastructures after executing all the reductions.""" + return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd new file mode 100644 index 0000000000000..de6458f8c6f26 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd @@ -0,0 +1,21 @@ +from ...utils._typedefs cimport DTYPE_t, ITYPE_t +from ...metrics._dist_metrics cimport DistanceMetric + + +cdef class DatasetsPair: + cdef DistanceMetric distance_metric + + cdef ITYPE_t n_samples_X(self) nogil + + cdef ITYPE_t n_samples_Y(self) nogil + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil + + +cdef class DenseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + ITYPE_t d diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx new file mode 100644 index 0000000000000..abef1bed098ed --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx @@ -0,0 +1,164 @@ +import numpy as np +cimport numpy as cnp + +from cython cimport final +from scipy.sparse import issparse + +from ...utils._typedefs cimport DTYPE_t, ITYPE_t +from ...metrics._dist_metrics cimport DistanceMetric + +cnp.import_array() + +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between a single pair of rows of + of X and Y at a time given the pair of their indices (i, j). This class is + specialized for each metric thanks to the :func:`get_for` factory classmethod. + + The handling of parallelization over chunks to compute the distances + and aggregation for several rows at a time is done in dedicated + subclasses of PairwiseDistancesReduction that in-turn rely on + subclasses of DatasetsPair for each pair of rows in the data. The goal + is to make it possible to decouple the generic parallelization and + aggregation logic from metric-specific computation as much as + possible. + + X and Y can be stored as C-contiguous np.ndarrays or CSR matrices + in subclasses. 
+ + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of dispatching method calls. + + Parameters + ---------- + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + str metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str, default='euclidean' + The distance metric to compute between rows of X and Y. + The default metric is a fast implementation of the Euclidean + metric. For a list of available metrics, see the documentation + of :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair + The suited DatasetsPair implementation. + """ + cdef: + DistanceMetric distance_metric = DistanceMetric.get_metric( + metric, + **(metric_kwargs or {}) + ) + + if not(X.dtype == Y.dtype == np.float64): + raise ValueError( + f"Only 64bit float datasets are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + # TODO: dispatch to other dataset pairs for sparse support once available: + if issparse(X) or issparse(Y): + raise ValueError("Only dense datasets are supported for X and Y.") + + return DenseDenseDatasetsPair(X, Y, distance_metric) + + def __init__(self, DistanceMetric distance_metric): + self.distance_metric = distance_metric + + cdef ITYPE_t n_samples_X(self) nogil: + """Number of samples in X.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -999 + + cdef ITYPE_t n_samples_Y(self) nogil: + """Number of samples in Y.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -999 + + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.dist(i, j) + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -1 + +@final +cdef class DenseDenseDatasetsPair(DatasetsPair): + """Compute distances between row vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two row vectors of (X, Y). 
+ """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Arrays have already been checked + self.X = X + self.Y = Y + self.d = X.shape[1] + + @final + cdef ITYPE_t n_samples_X(self) nogil: + return self.X.shape[0] + + @final + cdef ITYPE_t n_samples_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.d) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py new file mode 100644 index 0000000000000..a79fde694a9ed --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -0,0 +1,385 @@ +from abc import abstractmethod + +import numpy as np + +from typing import List +from scipy.sparse import issparse +from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING + +from ._base import _sqeuclidean_row_norms64 +from ._argkmin import PairwiseDistancesArgKmin64 +from ._radius_neighborhood import PairwiseDistancesRadiusNeighborhood64 + +from ... import get_config + + +def sqeuclidean_row_norms(X, num_threads): + """Compute the squared euclidean norm of the rows of X in parallel. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data. Must be c-contiguous. + + num_threads : int + The number of OpenMP threads to use. + + Returns + ------- + sqeuclidean_row_norms : ndarray of shape (n_samples,) + Arrays containing the squared euclidean norm of each row of X. + """ + if X.dtype == np.float64: + return _sqeuclidean_row_norms64(X, num_threads) + raise ValueError( + f"Only 64bit float datasets are supported at this time, got: X.dtype={X.dtype}." + ) + + +class PairwiseDistancesReduction: + """Abstract base dispatcher for pairwise distance computation & reduction. + + Each dispatcher extending the base :class:`PairwiseDistancesReduction` + dispatcher must implement the :meth:`compute` classmethod. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + "pyfunc", # is relatively slow because we need to coerce data as np arrays + "mahalanobis", # is numerically unstable + # TODO: In order to support discrete distance metrics, we need to have a + # stable simultaneous sort which preserves the order of the input. + # The best might be using std::stable_sort and a Comparator taking an + # Arrays of Structures instead of Structure of Arrays (currently used). + "hamming", + *BOOL_METRICS, + } + return sorted(set(METRIC_MAPPING.keys()) - excluded) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the PairwiseDistancesReduction can be used for the + given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the PairwiseDistancesReduction can be used, else False. 
+ """ + dtypes_validity = X.dtype == Y.dtype == np.float64 + return ( + get_config().get("enable_cython_pairwise_dist", True) + and not issparse(X) + and not issparse(Y) + and dtypes_validity + and metric in cls.valid_metrics() + ) + + @classmethod + @abstractmethod + def compute( + cls, + X, + Y, + **kwargs, + ): + """Compute the reduction. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + **kwargs : additional parameters for the reduction + + Notes + ----- + This method is an abstract class method: it has to be implemented + for all subclasses. + """ + + +class PairwiseDistancesArgKmin(PairwiseDistancesReduction): + """Compute the argkmin of row vectors of X on the ones of Y. + + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. + + PairwiseDistancesArgKmin is typically used to perform + bruteforce k-nearest neighbors queries. + + This class is not meant to be instanciated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def compute( + cls, + X, + Y, + k, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + return_distance=False, + ): + """Compute the argkmin reduction. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + k : int + The k for the argkmin reduction. + + metric : str, default='euclidean' + The distance metric to use for argkmin. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. 
+
+              - None (default) looks up the `pairwise_dist_parallel_strategy`
+                scikit-learn configuration option and uses 'auto' if it is not set.
+
+        return_distance : bool, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - argkmin_indices : ndarray of shape (n_samples_X, k)
+            Indices of the argkmin for each vector in X.
+
+        If return_distance=True:
+          - argkmin_distances : ndarray of shape (n_samples_X, k)
+            Distances to the argkmin for each vector in X.
+          - argkmin_indices : ndarray of shape (n_samples_X, k)
+            Indices of the argkmin for each vector in X.
+
+        Notes
+        -----
+        This classmethod is responsible for introspecting the argument
+        values to dispatch to the most appropriate implementation of
+        :class:`PairwiseDistancesArgKmin`.
+
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory handles the specialisation for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various backends and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistancesArgKmin64.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+        raise ValueError(
+            "Only 64bit float datasets are supported at this time, "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+
+class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
+    """Compute radius-based neighbors for two sets of vectors.
+
+    For each row-vector X[i] of the queries X, find all the indices j of
+    row-vectors in Y such that:
+
+        dist(X[i], Y[j]) <= radius
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    This class is not meant to be instantiated; one should only use
+    its :meth:`compute` classmethod, which handles allocation and
+    deallocation consistently.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        radius,
+        metric="euclidean",
+        chunk_size=None,
+        metric_kwargs=None,
+        strategy=None,
+        return_distance=False,
+        sort_results=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        radius : float
+            The radius defining the neighborhood.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None
+            The number of vectors per chunk. If None (default), the
+            `pairwise_dist_chunk_size` scikit-learn configuration option
+            is looked up, and 256 is used if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to the specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset the parallelization is made on.
+
+            For both strategies, the computation happens with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            The strategies differ in which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+                Each thread then iterates on all the chunks of Y. This strategy is
+                embarrassingly parallel and comes with no datastructure
+                synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+                Each thread processes all the chunks of X in turn. This strategy is
+                a sequence of embarrassingly parallel subtasks (the inner loop on Y
+                chunks) with intermediate datastructure synchronisation at each
+                iteration of the sequential outer loop on X chunks.
+
+              - 'auto' relies on a simple heuristic to choose between
+                'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+                'parallel_on_X' is usually the most efficient strategy.
+                When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
+                brings more opportunities for parallelism and is therefore more efficient
+                despite the synchronisation step at each iteration of the outer loop
+                on chunks of `X`.
+
+              - None (default) looks up the `pairwise_dist_parallel_strategy`
+                scikit-learn configuration option and uses 'auto' if it is not set.
+
+        return_distance : bool, default=False
+            Return distances between each X vector and its neighbors if set to True.
+
+        sort_results : bool, default=False
+            Sort results with respect to distances between each X vector and its
+            neighbors if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - neighbors_indices : ndarray of n_samples_X ndarrays
+            Indices of the neighbors for each vector in X.
+
+        If return_distance=True:
+          - neighbors_indices : ndarray of n_samples_X ndarrays
+            Indices of the neighbors for each vector in X.
+          - neighbors_distances : ndarray of n_samples_X ndarrays
+            Distances to the neighbors for each vector in X.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the argument
+        values to dispatch to the private dtype-specialized implementation of
+        :class:`PairwiseDistancesRadiusNeighborhood`.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the API entirely from the implementation
+        details whilst maintaining RAII.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory handles the specialisation for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various backends and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistancesRadiusNeighborhood64.compute(
+                X=X,
+                Y=Y,
+                radius=radius,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+                return_distance=return_distance,
+            )
+        raise ValueError(
+            "Only 64bit float datasets are supported at this time, "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+ ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd new file mode 100644 index 0000000000000..a1c5bd3a8d80c --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pxd @@ -0,0 +1,62 @@ +from ...utils._typedefs cimport DTYPE_t, ITYPE_t +from libcpp.vector cimport vector + + +cdef class GEMMTermComputer64: + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + ITYPE_t dist_middle_terms_chunks_size + ITYPE_t n_features + ITYPE_t chunk_size + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[DTYPE_t]] dist_middle_terms_chunks + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_init(self) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx new file mode 100644 index 0000000000000..77d752548bb5b --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx @@ -0,0 +1,135 @@ +from libcpp.vector cimport vector + +from ...utils._typedefs cimport DTYPE_t, ITYPE_t + +from ...utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + ColMajor, + NoTrans, + RowMajor, + Trans, + _gemm, +) + +cdef class GEMMTermComputer64: + """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. + + `FastEuclidean*` classes internally compute the squared Euclidean distances between + chunks of vectors X_c and Y_c using the following decomposition: + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high + arithmetic intensity. 
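+
+    For illustration only, the decomposition can be written with NumPy as
+    follows (the array names are local to this snippet; the actual
+    implementation works on memoryview chunks and calls BLAS GEMM directly):
+
+        import numpy as np
+
+        rng = np.random.default_rng(0)
+        X_c = rng.random((4, 3))  # a chunk of X
+        Y_c = rng.random((5, 3))  # a chunk of Y
+
+        middle_term = -2 * X_c @ Y_c.T              # the GEMM part
+        sq_dists = (
+            (X_c ** 2).sum(axis=1)[:, None]         # ||X_c_i||²
+            + middle_term
+            + (Y_c ** 2).sum(axis=1)[None, :]       # ||Y_c_j||²
+        )
+        # sq_dists[i, j] == ||X_c[i] - Y_c[j]||², up to rounding errors.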
+ """ + + def __init__(self, + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + self.X = X + self.Y = Y + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = n_features + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) + + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_init(self) nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + return + + cdef DTYPE_t * _compute_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + DTYPE_t * A = &X_c[0, 0] + DTYPE_t * B = &Y_c[0, 0] + ITYPE_t lda = X_c.shape[1] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. 
+ ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = `-2 * X_c @ Y_c.T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd new file mode 100644 index 0000000000000..737e6888a8a55 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pxd @@ -0,0 +1,89 @@ +cimport numpy as cnp + +from libcpp.memory cimport shared_ptr +from libcpp.vector cimport vector +from cython cimport final + +from ._base cimport ( + PairwiseDistancesReduction64, +) +from ._gemm_term_computer cimport GEMMTermComputer64 + +from ...utils._typedefs cimport ITYPE_t, DTYPE_t + +cnp.import_array() + +###################### +## std::vector to np.ndarray coercion +# As type covariance is not supported for C++ containers via Cython, +# we need to redefine fused types. +ctypedef fused vector_DITYPE_t: + vector[ITYPE_t] + vector[DTYPE_t] + + +ctypedef fused vector_vector_DITYPE_t: + vector[vector[ITYPE_t]] + vector[vector[DTYPE_t]] + +cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_DITYPE_t] vecs +) + +##################### + +cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): + """64bit implementation of PairwiseDistancesRadiusNeighborhood .""" + + cdef: + DTYPE_t radius + + # DistanceMetric compute rank-preserving surrogate distance via rdist + # which are proxies necessitating less computations. + # We get the equivalent for the radius to be able to compare it against + # vectors' rank-preserving surrogate distances. + DTYPE_t r_radius + + # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. + # + # For this implementation, we want resizable buffers which we will wrap + # into numpy arrays at the end. std::vector comes as a handy container + # for interacting efficiently with resizable buffers. + # + # Though it is possible to access their buffer address with + # std::vector::data, they can't be stolen: buffers lifetime + # is tied to their std::vector and are deallocated when + # std::vectors are. + # + # To solve this, we dynamically allocate std::vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when the associated np.ndarray is freed. + # + # Shared pointers (defined via shared_ptr) are use for safer memory management. + # Unique pointers (defined via unique_ptr) can't be used as datastructures + # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. + shared_ptr[vector[vector[ITYPE_t]]] neigh_indices + shared_ptr[vector[vector[DTYPE_t]]] neigh_distances + + # Used as array of pointers to private datastructures used in threads. 
+ vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks + vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks + + bint sort_results + + @final + cdef void _merge_vectors( + self, + ITYPE_t idx, + ITYPE_t num_threads, + ) nogil + + +cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesRadiusNeighborhood64): + """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesRadiusNeighborhood.""" + cdef: + GEMMTermComputer64 gemm_term_computer + const DTYPE_t[::1] X_norm_squared + const DTYPE_t[::1] Y_norm_squared + + bint use_squared_distances diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx new file mode 100644 index 0000000000000..db2c22e89d06d --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx @@ -0,0 +1,503 @@ +cimport numpy as cnp +import numpy as np +import warnings + +from libcpp.memory cimport shared_ptr, make_shared +from libcpp.vector cimport vector +from cython cimport final +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange + +from ._base cimport ( + PairwiseDistancesReduction64, + _sqeuclidean_row_norms64 +) + +from ._datasets_pair cimport ( + DatasetsPair, + DenseDenseDatasetsPair, +) + +from ._gemm_term_computer cimport GEMMTermComputer64 + +from ...utils._sorting cimport simultaneous_sort +from ...utils._typedefs cimport ITYPE_t, DTYPE_t +from ...utils._vector_sentinel cimport vector_to_nd_array + +from numbers import Real +from scipy.sparse import issparse +from sklearn.utils import check_scalar, _in_unstable_openblas_configuration +from sklearn.utils.fixes import threadpool_limits + +cnp.import_array() + +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa +cdef extern from "" namespace "std" nogil: + OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa + +###################### + +cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_DITYPE_t] vecs +): + """Coerce a std::vector of std::vector to a ndarray of ndarray.""" + cdef: + ITYPE_t n = deref(vecs).size() + cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) + + for i in range(n): + nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) + + return nd_arrays_of_nd_arrays + +##################### + + +cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): + """64bit implementation of PairwiseDistancesRadiusNeighborhood.""" + + @classmethod + def compute( + cls, + X, + Y, + DTYPE_t radius, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + bint sort_results=False, + ): + """Compute the radius-neighbors reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`PairwiseDistancesRadiusNeighborhood64`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. 
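+
+        As an illustration of the surrogate-distance trick used by the
+        concrete implementations (for the Euclidean metric, the
+        rank-preserving surrogate distance is the squared distance), the
+        radius check effectively becomes::
+
+            rdist(x, y) <= radius ** 2   # same verdict as dist(x, y) <= radius,
+                                         # but avoids computing square roots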
+
+        No instance should directly be created outside of this class method.
+        """
+        if (
+            metric in ("euclidean", "sqeuclidean")
+            and not issparse(X)
+            and not issparse(Y)
+        ):
+            # Specialized implementation with improved arithmetic intensity
+            # and vector instructions (SIMD) by processing several vectors
+            # at a time to leverage a call to the BLAS GEMM routine as explained
+            # in more detail in the GEMMTermComputer docstring.
+            use_squared_distances = metric == "sqeuclidean"
+            pda = FastEuclideanPairwiseDistancesRadiusNeighborhood64(
+                X=X, Y=Y, radius=radius,
+                use_squared_distances=use_squared_distances,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+            )
+        else:
+            # Fall back on a generic implementation that handles most scipy
+            # metrics by computing the distances between 2 vectors at a time.
+            pda = PairwiseDistancesRadiusNeighborhood64(
+                datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
+                radius=radius,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+            )
+
+        # Limit the number of threads in the second level of nested parallelism
+        # (i.e. BLAS) to avoid thread over-subscription (in GEMM for instance).
+        with threadpool_limits(limits=1, user_api="blas"):
+            if pda.execute_in_parallel_on_Y:
+                pda._parallel_on_Y()
+            else:
+                pda._parallel_on_X()
+
+        return pda._finalize_results(return_distance)
+
+
+    def __init__(
+        self,
+        DatasetsPair datasets_pair,
+        DTYPE_t radius,
+        chunk_size=None,
+        strategy=None,
+        sort_results=False,
+        metric_kwargs=None,
+    ):
+        super().__init__(
+            datasets_pair=datasets_pair,
+            chunk_size=chunk_size,
+            strategy=strategy,
+        )
+
+        self.radius = check_scalar(radius, "radius", Real, min_val=0)
+        self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius)
+        self.sort_results = sort_results
+
+        # Allocating pointers to datastructures but not the datastructures themselves.
+        # There are as many pointers as there are effective threads.
+        #
+        # For the sake of explicitness:
+        #   - when parallelizing on X, those pointers reference
+        #     self.neigh_distances and self.neigh_indices
+        #   - when parallelizing on Y, those pointers reference
+        #     std::vectors of std::vectors which are allocated per thread and whose
+        #     content will be merged into self.neigh_distances and self.neigh_indices.
+        self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]](
+            self.chunks_n_threads
+        )
+        self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]](
+            self.chunks_n_threads
+        )
+
+        # Temporary datastructures which will be coerced to numpy arrays right before
+        # PairwiseDistancesRadiusNeighborhood.compute returns, and which will then be freed.
+ self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) + self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t r_dist_i_j + + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): + r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread vectors' pointers to the main vectors'. + self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + cdef void _parallel_on_Y_init( + self, + ) nogil: + cdef: + ITYPE_t thread_num + # As chunks of X are shared across threads, so must datastructures to avoid race + # conditions: each thread has its own vectors of n_samples_X vectors which are + # then merged back in the main n_samples_X vectors. + for thread_num in range(self.chunks_n_threads): + self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) + self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) + + @final + cdef void _merge_vectors( + self, + ITYPE_t idx, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t thread_num + ITYPE_t idx_n_elements = 0 + ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers only once for the given number of elements. + for thread_num in range(num_threads): + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + + # Moving the elements by range using the range first element + # as the reference for the insertion. 
+        for thread_num in range(num_threads):
+            move(
+                deref(self.neigh_distances_chunks[thread_num])[idx].begin(),
+                deref(self.neigh_distances_chunks[thread_num])[idx].end(),
+                deref(self.neigh_distances)[idx].begin() + last_element_idx
+            )
+            move(
+                deref(self.neigh_indices_chunks[thread_num])[idx].begin(),
+                deref(self.neigh_indices_chunks[thread_num])[idx].end(),
+                deref(self.neigh_indices)[idx].begin() + last_element_idx
+            )
+            last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size()
+
+
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
+
+        with nogil, parallel(num_threads=self.effective_n_threads):
+            # Merge vectors used in threads into the main ones.
+            # This is done in parallel sample-wise (no need for locks).
+            for idx in prange(self.n_samples_X, schedule='static'):
+                self._merge_vectors(idx, self.chunks_n_threads)
+
+            # The contents of the vectors have been std::moved.
+            # Hence they can't be used anymore and can be deleted.
+            # Their deletion is carried out automatically, as the
+            # implementation relies on shared pointers.
+
+            # Sort in parallel in ascending order w.r.t. the distances if requested.
+            if self.sort_results:
+                for idx in prange(self.n_samples_X, schedule='static'):
+                    simultaneous_sort(
+                        deref(self.neigh_distances)[idx].data(),
+                        deref(self.neigh_indices)[idx].data(),
+                        deref(self.neigh_indices)[idx].size()
+                    )
+
+        return
+
+    cdef void compute_exact_distances(self) nogil:
+        """Convert rank-preserving distances to pairwise distances in parallel."""
+        cdef:
+            ITYPE_t i, j
+
+        for i in prange(self.n_samples_X, nogil=True, schedule='static',
+                        num_threads=self.effective_n_threads):
+            for j in range(deref(self.neigh_indices)[i].size()):
+                deref(self.neigh_distances)[i][j] = (
+                        self.datasets_pair.distance_metric._rdist_to_dist(
+                            # Guard against a potential -0., which would produce NaNs.
+                            max(deref(self.neigh_distances)[i][j], 0.)
+ ) + ) + + +cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesRadiusNeighborhood64): + """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesRadiusNeighborhood.""" + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesRadiusNeighborhood64.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + DTYPE_t radius, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + sort_results=False, + metric_kwargs=None, + ): + if ( + metric_kwargs is not None and + len(metric_kwargs) > 0 and + "Y_norm_squared" not in metric_kwargs + ): + warnings.warn( + f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " + f"usable for this case (FastEuclideanPairwiseDistancesRadiusNeighborhood) and will be ignored.", + UserWarning, + stacklevel=3, + ) + + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + metric_kwargs=metric_kwargs, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + + self.gemm_term_computer = GEMMTermComputer64( + datasets_pair.X, + datasets_pair.Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, + ) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + else: + self.Y_norm_squared = _sqeuclidean_row_norms64(datasets_pair.Y, self.effective_n_threads) + + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms64(datasets_pair.X, self.effective_n_threads) + ) + self.use_squared_distances = use_squared_distances + + if use_squared_distances: + # In this specialisation and this setup, the value passed to the radius is + # already considered to be the adapted radius, so we overwrite it. 
+ self.r_radius = radius + + @final + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_X_parallel_init(self, thread_num) + self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final + cdef void _parallel_on_Y_init( + self, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_init(self) + self.gemm_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesRadiusNeighborhood64.compute_exact_distances(self) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + DTYPE_t *dist_middle_terms = self.gemm_term_computer._compute_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + # Pushing the distance and their associated indices in vectors. 
+ for i in range(n_X): + for j in range(n_Y): + # Using the squared euclidean distance as the rank-preserving distance: + # + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # + squared_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + if squared_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) diff --git a/sklearn/metrics/_pairwise_distances_reduction/setup.py b/sklearn/metrics/_pairwise_distances_reduction/setup.py new file mode 100644 index 0000000000000..0d8c2c8ce33de --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/setup.py @@ -0,0 +1,38 @@ +import os + +import numpy as np +from numpy.distutils.misc_util import Configuration + + +def configuration(parent_package="", top_path=None): + config = Configuration("_pairwise_distances_reduction", parent_package, top_path) + libraries = [] + if os.name == "posix": + libraries.append("m") + + cython_sources = [ + "_datasets_pair.pyx", + "_gemm_term_computer.pyx", + "_base.pyx", + "_argkmin.pyx", + "_radius_neighborhood.pyx", + ] + + for source_file in cython_sources: + private_extension_name = source_file.replace(".pyx", "") + config.add_extension( + name=private_extension_name, + sources=[source_file], + include_dirs=[np.get_include()], + language="c++", + libraries=libraries, + extra_compile_args=["-std=c++11"], + ) + + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(**configuration().todict()) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index fc912068cb6c4..e6e13a8c3e030 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -16,6 +16,7 @@ def configuration(parent_package="", top_path=None): config.add_subpackage("_plot") config.add_subpackage("_plot.tests") config.add_subpackage("cluster") + config.add_subpackage("_pairwise_distances_reduction") config.add_extension( "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries @@ -35,15 +36,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_pairwise_distances_reduction", - sources=["_pairwise_distances_reduction.pyx"], - include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], - language="c++", - libraries=libraries, - extra_compile_args=["-std=c++11"], - ) - config.add_subpackage("tests") return config diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index b47407f3754ee..0b9c6e6aad196 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -12,7 +12,7 @@ PairwiseDistancesReduction, PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood, - _sqeuclidean_row_norms64, + sqeuclidean_row_norms, ) from sklearn.metrics import euclidean_distances @@ -20,6 +20,7 @@ from sklearn.utils._testing import ( assert_array_equal, assert_allclose, + create_memmap_backed_data, ) # Common supported metric between scipy.spatial.distance.cdist @@ -666,7 +667,7 @@ def test_chunk_size_agnosticism( n_features=100, dtype=np.float64, ): - # Results should not depend on the chunk size + # Results must not depend on the chunk size rng = np.random.RandomState(global_random_seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * 
spread @@ -717,7 +718,7 @@ def test_n_threads_agnosticism( n_features=100, dtype=np.float64, ): - # Results should not depend on the number of threads + # Results must not depend on the number of threads rng = np.random.RandomState(global_random_seed) spread = 100 X = rng.rand(n_samples, n_features).astype(dtype) * spread @@ -952,6 +953,57 @@ def test_pairwise_distances_radius_neighbors( ) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +@pytest.mark.parametrize("metric", ["manhattan", "euclidean"]) +def test_memmap_backed_data( + metric, + PairwiseDistancesReduction, + n_samples=512, + n_features=100, + dtype=np.float64, +): + # Results must not depend on the datasets writability + rng = np.random.RandomState(0) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Create read only datasets + X_mm, Y_mm = create_memmap_backed_data([X, Y]) + + if PairwiseDistancesReduction is PairwiseDistancesArgKmin: + parameter = 10 + check_parameters = {} + else: + # Scaling the radius slightly with the numbers of dimensions + radius = 10 ** np.log(n_features) + parameter = radius + check_parameters = {"radius": radius} + + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + return_distance=True, + ) + + dist_mm, indices_mm = PairwiseDistancesReduction.compute( + X_mm, + Y_mm, + parameter, + metric=metric, + return_distance=True, + ) + + ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( + ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters + ) + + @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("num_threads", [1, 2, 8]) @@ -967,6 +1019,10 @@ def test_sqeuclidean_row_norms( X = rng.rand(n_samples, n_features).astype(dtype) * spread sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 - sq_row_norm = np.asarray(_sqeuclidean_row_norms64(X, num_threads=num_threads)) + sq_row_norm = np.asarray(sqeuclidean_row_norms(X, num_threads=num_threads)) assert_allclose(sq_row_norm_reference, sq_row_norm) + + with pytest.raises(ValueError): + X = np.asfortranarray(X) + sqeuclidean_row_norms(X, num_threads=num_threads)
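For context, here is a minimal end-to-end sketch of the public dispatcher API exercised by the tests above. It assumes, as the test module does, that the new `sklearn.metrics._pairwise_distances_reduction` subpackage re-exports the dispatchers; the data shapes and the radius value below are arbitrary.

    import numpy as np
    from sklearn.metrics._pairwise_distances_reduction import (
        PairwiseDistancesArgKmin,
        PairwiseDistancesRadiusNeighborhood,
    )

    rng = np.random.RandomState(0)
    X = rng.rand(1000, 20)  # float64 and C-contiguous, as required
    Y = rng.rand(2000, 20)

    # Brute-force k-nearest neighbors: distances to and indices of the
    # 5 closest rows of Y for each row of X.
    argkmin_distances, argkmin_indices = PairwiseDistancesArgKmin.compute(
        X, Y, k=5, metric="euclidean", return_distance=True
    )

    # Radius-based neighbors: for each row of X, the indices of all rows
    # of Y within the given radius (one variable-length array per row).
    neighbors_indices = PairwiseDistancesRadiusNeighborhood.compute(
        X, Y, radius=1.5, metric="euclidean"
    )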