diff --git a/.circleci/config.yml b/.circleci/config.yml index 2e8958a2ab879..ce7a170103502 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -116,6 +116,8 @@ jobs: environment: - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 + - NUMPY_VERSION: 'latest' + - SCIPY_VERSION: 'latest' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 3d555f66227c4..9ad7418e855ca 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -21,39 +21,51 @@ source build_tools/shared.sh sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update -sudo apt-get install python3-virtualenv ccache -python3 -m virtualenv --system-site-packages --python=python3 testenv -source testenv/bin/activate -pip install --upgrade pip + +# Setup conda environment +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" + +# Install Mambaforge +wget $MINICONDA_URL -O mambaforge.sh +MINICONDA_PATH=$HOME/miniconda +chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MINICONDA_PATH +export PATH=$MINICONDA_PATH/bin:$PATH +mamba update --yes conda + +# Create environment and install dependencies +mamba create -n testenv --yes python=3.7 +source activate testenv + +# Use the latest by default +mamba install --verbose -y ccache \ + pip \ + $(get_dep numpy $NUMPY_VERSION) \ + $(get_dep scipy $SCIPY_VERSION) \ + $(get_dep cython $CYTHON_VERSION) \ + $(get_dep joblib $JOBLIB_VERSION) \ + $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ + $(get_dep pytest $PYTEST_VERSION) \ + $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) setup_ccache -python -m pip install $(get_dep cython $CYTHON_VERSION) \ - $(get_dep joblib $JOBLIB_VERSION) -python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ - $(get_dep pytest $PYTEST_VERSION) \ - $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) if [[ "$COVERAGE" == "true" ]]; then - python -m pip install codecov pytest-cov -fi - -if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - python -m pip install pytest-xdist + mamba install --verbose -y codecov pytest-cov fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx - python -m pip install sphinx - python -m pip install numpydoc + mamba install --verbose -y sphinx + mamba install --verbose -y numpydoc fi python --version +conda list # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 -python -m pip list -pip install --verbose --editable . +pip install --verbose --editable . --no-build-isolation ccache -s python -c "import sklearn; sklearn.show_versions()" python -m threadpoolctl --import sklearn diff --git a/doc/glossary.rst b/doc/glossary.rst index 010f16a361531..2b4c6af0d1866 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -644,9 +644,8 @@ General Concepts Note that for most distance metrics, we rely on implementations from :mod:`scipy.spatial.distance`, but may reimplement for efficiency in - our context. The :mod:`neighbors` module also duplicates some metric - implementations for integration with efficient binary tree search data - structures. + our context. The :class:`metrics.DistanceMetric` interface is used to implement + distance metrics for integration with efficient neighbors search. 
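For reviewers tracking the relocation, here is an illustrative snippet (not part of this patch) showing the interface at its new import path; it mirrors the doctest updated in sklearn/metrics/_dist_metrics.pyx later in this diff, where the import previously read `from sklearn.neighbors import DistanceMetric`:

    from sklearn.metrics import DistanceMetric  # was: from sklearn.neighbors import DistanceMetric

    # Build a metric object and compute pairwise distances between rows.
    dist = DistanceMetric.get_metric("euclidean")
    X = [[0, 1, 2], [3, 4, 5]]
    print(dist.pairwise(X))  # 2x2 symmetric matrix; off-diagonal entries equal sqrt(27) ≈ 5.196
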
pd A shorthand for `Pandas `_ due to the @@ -1023,7 +1022,7 @@ such as: Further examples: -* :class:`neighbors.DistanceMetric` +* :class:`metrics.DistanceMetric` * :class:`gaussian_process.kernels.Kernel` * ``tree.Criterion`` diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 72b67b23e8dc3..b7000bcf7cbb2 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1058,6 +1058,16 @@ further details. metrics.consensus_score +Distance metrics +---------------- + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metrics.DistanceMetric Pairwise metrics ---------------- @@ -1317,7 +1327,6 @@ Model validation :template: class.rst neighbors.BallTree - neighbors.DistanceMetric neighbors.KDTree neighbors.KernelDensity neighbors.KNeighborsClassifier diff --git a/doc/modules/density.rst b/doc/modules/density.rst index 115d318183577..6440bf79ab729 100644 --- a/doc/modules/density.rst +++ b/doc/modules/density.rst @@ -136,7 +136,7 @@ The form of these kernels is as follows: :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h` The kernel density estimator can be used with any of the valid distance -metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though +metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though the results are properly normalized only for the Euclidean metric. One particularly useful metric is the `Haversine distance `_ diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index c400f5ba57685..91322dba632d6 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -523,7 +523,9 @@ def predict(self, X): if self.cluster_centers_.shape[0] > 0: with config_context(assume_finite=True): - return pairwise_distances_argmin(X, self.cluster_centers_) + return pairwise_distances_argmin( + X, self.cluster_centers_, metric="fast_euclidean" + ) else: warnings.warn( "This model does not have any cluster centers " diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 6606f370b81eb..70b3a5028169b 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,8 +16,8 @@ from ..base import BaseEstimator, ClusterMixin from ..metrics.pairwise import paired_distances -from ..neighbors import DistanceMetric -from ..neighbors._dist_metrics import METRIC_MAPPING +from ..metrics import DistanceMetric +from ..metrics._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 78c15bb8e1a15..d2dd3f937a27d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -12,7 +12,6 @@ from ..metrics import pairwise_distances_argmin from ..metrics.pairwise import euclidean_distances from ..base import TransformerMixin, ClusterMixin, BaseEstimator -from ..utils.extmath import row_norms from ..utils import deprecated from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -654,11 +653,10 @@ def predict(self, X): """ check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) - kwargs = {"Y_norm_squared": self._subcluster_norms} with config_context(assume_finite=True): argmin = pairwise_distances_argmin( - X, self.subcluster_centers_, metric_kwargs=kwargs + X, 
self.subcluster_centers_, metric="fast_euclidean" ) return self.subcluster_labels_[argmin] @@ -704,9 +702,6 @@ def _global_clustering(self, X=None): "n_clusters should be an instance of ClusterMixin or an int" ) - # To use in predict to avoid recalculation. - self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) - if clusterer is None or not_enough_centroids: self.subcluster_labels_ = np.arange(len(centroids)) if not_enough_centroids: diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 2a58757ce327d..11ea3294c086a 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -13,7 +13,7 @@ ctypedef np.int8_t INT8 np.import_array() -from ..neighbors._dist_metrics cimport DistanceMetric +from ..metrics._dist_metrics cimport DistanceMetric from ..utils._fast_dict cimport IntFloatDict # C++ @@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b, def average_merge(IntFloatDict a, IntFloatDict b, np.ndarray[ITYPE_t, ndim=1] mask, ITYPE_t n_a, ITYPE_t n_b): - """Merge two IntFloatDicts with the average strategy: when the - same key is present in the two dicts, the weighted average of the two + """Merge two IntFloatDicts with the average strategy: when the + same key is present in the two dicts, the weighted average of the two values is used. Parameters @@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b, ############################################################################### -# An edge object for fast comparisons +# An edge object for fast comparisons cdef class WeightedEdge: cdef public ITYPE_t a cdef public ITYPE_t b cdef public DTYPE_t weight - + def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b): self.weight = weight self.a = a @@ -326,7 +326,7 @@ cdef class WeightedEdge: return self.weight > other.weight elif op == 5: return self.weight >= other.weight - + def __repr__(self): return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, self.weight, @@ -475,7 +475,7 @@ def mst_linkage_core( dist_metric: DistanceMetric A DistanceMetric object conforming to the API from - ``sklearn.neighbors._dist_metrics.pxd`` that will be + ``sklearn.metrics._dist_metrics.pxd`` that will be used to compute distances. 
Returns @@ -534,4 +534,3 @@ def mst_linkage_core( current_node = new_node return np.array(result) - diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index cc3930891d880..542ed0dbc97aa 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -512,4 +512,6 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, reset=False) with config_context(assume_finite=True): - return pairwise_distances_argmin(X, self.cluster_centers_) + return pairwise_distances_argmin( + X, self.cluster_centers_, metric="fast_euclidean" + ) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 92f92dc3736e3..3525643383c26 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -17,7 +17,7 @@ from scipy.sparse.csgraph import connected_components from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -31,6 +31,7 @@ _fix_connectivity, ) from sklearn.feature_extraction.image import grid_to_graph +from sklearn.metrics import DistanceMetric from sklearn.metrics.pairwise import ( PAIRED_DISTANCES, cosine_distances, @@ -38,7 +39,7 @@ pairwise_distances, ) from sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.neighbors import kneighbors_graph, DistanceMetric +from sklearn.neighbors import kneighbors_graph from sklearn.cluster._hierarchical_fast import ( average_merge, max_merge, diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 46958ea4ef7f8..e4339229c5b64 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -36,6 +36,8 @@ from ._classification import brier_score_loss from ._classification import multilabel_confusion_matrix +from ._dist_metrics import DistanceMetric + from . 
import cluster from .cluster import adjusted_mutual_info_score from .cluster import adjusted_rand_score @@ -115,6 +117,7 @@ "davies_bouldin_score", "DetCurveDisplay", "det_curve", + "DistanceMetric", "euclidean_distances", "explained_variance_score", "f1_score", diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd similarity index 64% rename from sklearn/neighbors/_dist_metrics.pxd rename to sklearn/metrics/_dist_metrics.pxd index 5b223f8c6d8a8..e87f442019a9d 100644 --- a/sklearn/neighbors/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -1,14 +1,13 @@ #!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False -cimport cython cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow +from libc.math cimport sqrt, exp -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE +from ..utils._typedefs cimport DTYPE_t, ITYPE_t ###################################################################### # Inline distance functions @@ -60,9 +59,25 @@ cdef class DistanceMetric: cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 + cdef DTYPE_t csr_dist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1 + + cdef DTYPE_t csr_rdist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1 + cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -71,3 +86,24 @@ cdef class DistanceMetric: cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1 cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 + + +###################################################################### +# DatasetsPair base class +cdef class DatasetsPair: + cdef DistanceMetric distance_metric + + cdef ITYPE_t n_X(self) nogil + + cdef ITYPE_t n_Y(self) nogil + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil + + +cdef class DenseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + ITYPE_t d diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx similarity index 73% rename from sklearn/neighbors/_dist_metrics.pyx rename to sklearn/metrics/_dist_metrics.pyx index 240a7a3f7d14d..f75a3a2a75fcb 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1,8 +1,8 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: initializedcheck=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False + # By Jake Vanderplas (2013) # written for the scikit-learn project @@ -10,6 +10,8 @@ import numpy as np cimport numpy as np +from cython cimport final + np.import_array() # required in order to use C-API @@ -19,7 +21,7 @@ cdef extern from "arrayobject.h": int typenum, void* data) -cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): +cdef inline 
np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n): # Wrap a memory buffer with an ndarray. Warning: this is not robust. # In particular, if x is deallocated before the returned array goes # out of scope, this could cause memory errors. Since there is not @@ -29,13 +31,14 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -# some handy constants from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE -from ._typedefs import DTYPE, ITYPE +from scipy.sparse import csr_matrix, issparse +from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE +from ..utils._typedefs import DTYPE, ITYPE +from ..utils import check_array ###################################################################### # newObj function @@ -73,6 +76,16 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance, 'haversine': HaversineDistance, 'pyfunc': PyFuncDistance} +BOOL_METRICS = [ + "matching", + "jaccard", + "dice", + "kulsinski", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] def get_valid_metric_ids(L): """Given an iterable of metric class names or class identifiers, @@ -98,7 +111,7 @@ cdef class DistanceMetric: Examples -------- - >>> from sklearn.neighbors import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] @@ -197,8 +210,8 @@ cdef class DistanceMetric: """ def __cinit__(self): self.p = 2 - self.vec = np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -291,17 +304,50 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - """Compute the reduced distance between vectors x1 and x2. + """Compute the rank-preserving surrogate distance between vectors x1 and x2. This can optionally be overridden in a base class. - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. """ return self.dist(x1, x2, size) + cdef DTYPE_t csr_dist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1: + """Compute the rank-preserving surrogate distance between vectors x1 and x2 + given non null coordinates and their corresponding indices. + + This should be overridden in a base class. + """ + return -999 + + cdef DTYPE_t csr_rdist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1: + """Compute the rank-preserving surrogate distance between vectors x1 and x2 + given non null coordinates and their corresponding indices. + + This can optionally be overridden in a base class. 
+ + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. + """ + return self.csr_dist(x1_data, x1_indices, x2_data, x2_indices) + cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: """compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 @@ -323,25 +369,25 @@ cdef class DistanceMetric: return 0 cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1: - """Convert the reduced distance to the distance""" + """Convert the rank-preserving surrogate distance to the distance""" return rdist cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the reduced distance""" + """Convert the distance to the rank-preserving surrogate distance""" return dist def rdist_to_dist(self, rdist): - """Convert the Reduced distance to the true distance. + """Convert the rank-preserving surrogate distance to the true distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- rdist : double - Reduced distance. + Rank-preserving surrogate distance. Returns ------- @@ -351,12 +397,12 @@ cdef class DistanceMetric: return rdist def dist_to_rdist(self, dist): - """Convert the true distance to the reduced distance. + """Convert the true distance to the rank-preserving surrogate distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- @@ -366,7 +412,7 @@ cdef class DistanceMetric: Returns ------- double - Reduced distance. + Rank-preserving surrogate distance. """ return dist @@ -519,7 +565,7 @@ cdef class ChebyshevDistance(DistanceMetric): Examples -------- - >>> from sklearn.neighbors.dist_metrics import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('chebyshev') >>> X = [[0, 1, 2], ... [3, 4, 5]] @@ -1145,3 +1191,340 @@ cdef class PyFuncDistance(DistanceMetric): cdef inline double fmax(double a, double b) nogil: return max(a, b) + + +###################################################################### +# Datasets Pair Classes +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between two vectors (X_i, Y_j) + (rows of X and Y) at a time given the pair of their indices (i, j). + + X and Y can be stored as np.ndarrays or CSR matrices in subclasses. 
+ + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of method calls' dispatch. + + Parameters + ---------- + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + str metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_X, d) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_Y, d) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str, default='euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair + The suited DatasetsPair implementation. + """ + cdef: + DistanceMetric distance_metric = DistanceMetric.get_metric( + metric, + **(metric_kwargs or {}) + ) + + if X.dtype != np.float64 or Y.dtype != np.float64: + raise ValueError("Only 64bit float datasets are supported for X and Y.") + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + if not issparse(X) and not issparse(Y): + return DenseDenseDatasetsPair(X, Y, distance_metric) + if issparse(X) and not issparse(Y): + return SparseDenseDatasetsPair(X, Y, distance_metric) + if not issparse(X) and issparse(Y): + return DenseSparseDatasetsPair(X, Y, distance_metric) + return SparseSparseDatasetsPair(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=ITYPE) + X_indptr = np.asarray(X.indptr, dtype=ITYPE) + return X_data, X_indptr, X_indptr + + def __init__(self, DistanceMetric distance_metric): + self.distance_metric = distance_metric + + cdef ITYPE_t n_X(self) nogil: + """Number of samples in X.""" + return -999 + + cdef ITYPE_t n_Y(self) nogil: + """Number of samples in Y.""" + return -999 + + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.dist(i, j) + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return -1 + +@final +cdef class DenseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_X, d) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_Y, d) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Arrays have already been checked + self.X = X + self.Y = Y + self.d = X.shape[1] + + @final + cdef ITYPE_t n_X(self) nogil: + return self.X.shape[0] + + @final + cdef ITYPE_t n_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist(&self.X[i, 0], + &self.Y[j, 0], + self.d) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist(&self.X[i, 0], + &self.Y[j, 0], + self.d) + +@final +cdef class SparseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two CSR matrices. + + Parameters + ---------- + X: sparse matrix of shape (n_X, d) + Rows represent vectors. Must be in CSR format. + + Y: sparse matrix of shape (n_X, d) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + cdef: + const DTYPE_t[:] X_data + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + + const DTYPE_t[:] Y_data + const ITYPE_t[:] Y_indices + const ITYPE_t[:] Y_indptr + + + def __init__(self, X, Y, DistanceMetric distance_metric): + DatasetsPair.__init__(self, distance_metric) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + @final + cdef ITYPE_t n_X(self) nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef ITYPE_t n_Y(self) nogil: + return self.Y_indptr.shape[0] -1 + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + ITYPE_t yj_start = self.Y_indptr[j] + ITYPE_t yj_end = self.Y_indptr[j + 1] + + return self.distance_metric.csr_rdist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y_data[yj_start:yj_end], + self.Y_indices[yj_start:yj_end], + ) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + ITYPE_t yj_start = self.Y_indptr[j] + ITYPE_t yj_end = self.Y_indptr[j + 1] + + return self.distance_metric.csr_dist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y_data[yj_start:yj_end], + self.Y_indices[yj_start:yj_end] + ) + +@final +cdef class SparseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a CSR matrix and a dense array. + + Parameters + ---------- + X: sparse matrix of shape (n_X, d) + Rows represent vectors. Must be in CSR format. + + Y: ndarray of shape (n_Y, d) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + cdef: + const DTYPE_t[:] X_data + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + + const DTYPE_t[:, ::1] Y # shape: (n_Y, d) + const ITYPE_t[:] Y_indices + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + # This array already has been checked here + self.Y = Y + self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) + + @final + cdef ITYPE_t n_X(self) nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef ITYPE_t n_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + + # TODO: the 2D to 1D memory-view conversion might make computation slower, see: + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # Ideally, we could pass pointers and indices and access elements + # then in distance_metric.dist + return self.distance_metric.csr_rdist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices + ) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + + # TODO: same as previous comment + return self.distance_metric.csr_dist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices + ) + +@final +cdef class DenseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a dense array and a CSR matrix. + + Parameters + ---------- + X: ndarray of shape (n_X, d) + Rows represent vectors. Must be C-contiguous. + + Y: sparse matrix of shape (n_Y, d) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + cdef: + # As distance metrics are symmetric functions, we can + # simply rely on the other DatasetsPair and swap arguments. + DatasetsPair datasets_pair + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Swapping arguments on the constructor + self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) + + @final + cdef ITYPE_t n_X(self) nogil: + # Swapping interface + return self.datasets_pair.n_Y() + + @final + cdef ITYPE_t n_Y(self) nogil: + # Swapping interface + return self.datasets_pair.n_X() + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + # Swapping arguments on the same interface + return self.datasets_pair.ranking_preserving_dist(j, i) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + # Swapping arguments on the same interface + return self.datasets_pair.dist(j, i) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx new file mode 100644 index 0000000000000..19f29681c311f --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -0,0 +1,1852 @@ +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False +# distutils: language=c++ + +# Pairwise Distances Reductions +# ============================= +# +# Author: Julien Jerphanion +# +# +# The routines defined here are used in various algorithms performing +# the same structure of operations on distances between vectors +# of a datasets pair (X, Y). 
+ +import numpy as np +cimport numpy as np + +from .. import get_config + +np.import_array() + +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from libc.math cimport exp +from libcpp.vector cimport vector +from cython cimport final +from cpython.object cimport PyObject +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from cpython.ref cimport Py_INCREF + +from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair +from ..utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + ColMajor, + NoTrans, + RowMajor, + Trans, + _dot, + _gemm, +) +from ..utils._heap cimport simultaneous_sort, heap_push +from ..utils._openmp_helpers cimport _openmp_thread_num +from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t +from ..utils._typedefs cimport ITYPECODE, DTYPECODE + +from numbers import Integral, Real +from typing import List +from scipy.sparse import issparse +from threadpoolctl import threadpool_limits +from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING +from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._typedefs import ITYPE, DTYPE + +# Those constants have been chosen for modern laptops' caches and architecture. +DEF CHUNK_SIZE = 256 # number of vectors +DEF MIN_CHUNK_SAMPLES = 20 + + +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa +cdef extern from "" namespace "std" nogil: + OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa + +###################### +## std::vector to np.ndarray coercion +# As type covariance is not supported for C++ containers via Cython, +# we need to redefine fused types. +ctypedef fused vector_DITYPE_t: + vector[ITYPE_t] + vector[DTYPE_t] + + +ctypedef fused vector_vector_DITYPE_t: + vector[vector[ITYPE_t]] + vector[vector[DTYPE_t]] + + +cdef class StdVectorSentinel: + """Wraps a reference to a vector which will be deallocated with this object. + + When created, the StdVectorSentinel swaps the reference of its internal + vectors with the provided one (vec_ptr), thus making the StdVectorSentinel + manage the provided one's lifetime. + """ + pass + + +# We necessarily need to define two extension types extending StdVectorSentinel +# because we need to provide the dtype of the vector but can't use numeric fused types. +cdef class StdVectorSentinelDTYPE(StdVectorSentinel): + cdef vector[DTYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[DTYPE_t] * vec_ptr): + # This initializes the object directly without calling __init__ + cdef StdVectorSentinelDTYPE sentinel = StdVectorSentinelDTYPE.__new__(StdVectorSentinelDTYPE) + sentinel.vec.swap(deref(vec_ptr)) + return sentinel + + +cdef class StdVectorSentinelITYPE(StdVectorSentinel): + cdef vector[ITYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[ITYPE_t] * vec_ptr): + # This initializes the object directly without calling __init__ + cdef StdVectorSentinelITYPE sentinel = StdVectorSentinelITYPE.__new__(StdVectorSentinelITYPE) + sentinel.vec.swap(deref(vec_ptr)) + return sentinel + + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. 
+ + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + DTYPE_t * X_ptr = &X[0, 0] + ITYPE_t idx = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return row_norms + +cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): + """Create a numpy ndarray given a C++ vector. + + The numpy array buffer is the one of the C++ vector. + A StdVectorSentinel is registered as the base object for the numpy array, + freeing the C++ vector it encapsulates when the numpy array is freed. + """ + typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE + cdef: + np.npy_intp size = deref(vect_ptr).size() + np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, + deref(vect_ptr).data()) + StdVectorSentinel sentinel + + if vector_DITYPE_t is vector[DTYPE_t]: + sentinel = StdVectorSentinelDTYPE.create_for(vect_ptr) + else: + sentinel = StdVectorSentinelITYPE.create_for(vect_ptr) + + # Makes the numpy array responsible of the life-cycle of its buffer. + # A reference to the StdVectorSentinel will be stolen by the call bellow, + # so we increase its reference counter. + # See: https://docs.python.org/3/c-api/intro.html#reference-count-details + Py_INCREF(sentinel) + np.PyArray_SetBaseObject(arr, sentinel) + return arr + + +cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + vector_vector_DITYPE_t* vecs +): + """Coerce a std::vector of std::vector to a ndarray of ndarray.""" + cdef: + ITYPE_t n = deref(vecs).size() + np.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, + dtype=np.ndarray) + + for i in range(n): + nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) + + return nd_arrays_of_nd_arrays + +##################### + +cdef class PairwiseDistancesReduction: + """Abstract class which compute pairwise distances between + a set of vectors (rows) X and another set of vectors (rows) of Y + and apply a reduction on top. + + The computations of the distances and the reduction is parallelized + on chunks of vectors of X and Y. + + Parameters + ---------- + datasets_pair: DatasetsPair + The pair of dataset to use. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. + + None and -1 means using all processors. 
+ """ + + cdef: + DatasetsPair _datasets_pair + + ITYPE_t n_threads + ITYPE_t effective_omp_n_thread + ITYPE_t n_samples_chunk, chunk_size + + ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder + ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + "pyfunc", # is relatively slow because we need to coerce data as np arrays + "mahalanobis", # is numerically unstable + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using a std::sort and a Comparator which might need + # AoS instead of SoA (currently used). + "hamming", + *BOOL_METRICS, + } + return sorted({"fast_euclidean", "fast_sqeuclidean", + *METRIC_MAPPING.keys()}.difference(excluded)) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the PairwiseDistancesReduction for the given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_X, d) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_Y, d) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the PairwiseDistancesReduction can be used, else False. + """ + # Coercing to np.array to get the dtype + # TODO: what is the best way to get lists' dtype? + X = np.asarray(X) if isinstance(X, (tuple, list)) else X + Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y + # TODO: support sparse arrays and 32 bits + return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and + not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and + metric in cls.valid_metrics()) + + @property + def datasets_pair(self) -> DatasetsPair: + return self._datasets_pair + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + ): + cdef: + ITYPE_t X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", CHUNK_SIZE) + + check_scalar(chunk_size, "chunk_size", Integral, min_val=1) + self.chunk_size = chunk_size + + self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) + + self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + self._datasets_pair = datasets_pair + + self.n_Y = datasets_pair.n_Y() + self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) + Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk + self.Y_n_samples_remainder = self.n_Y % self.Y_n_samples_chunk + + self.n_X = datasets_pair.n_X() + self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) + X_n_full_chunks = self.n_X // self.X_n_samples_chunk + self.X_n_samples_remainder = self.n_X % self.X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + self.Y_n_chunks = Y_n_full_chunks + ( + self.n_Y != (Y_n_full_chunks * self.Y_n_samples_chunk) + ) + + self.X_n_chunks = X_n_full_chunks + ( + self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) + ) + + def compute( + self, + str strategy=None, + bint return_distance=False, + ): + """Computes the reduction of vectors (rows) of X on Y. + + Parameters + ---------- + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. 
+ + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation + but is less used in practice (because X is smaller than Y generally). + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. However it is more useful in practice (because Y is + larger than X generally). + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + + Returns + ------- + Results for the PairwiseDistancesReduction, usually an array of indices + and optionally an array of associated distances if return_distance is True. + """ + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + + return self._finalize_results(return_distance) + + @final + cdef void _parallel_on_X(self) nogil: + """Computes the reduction of each vector (row) of X on Y + by parallelizing computation on chunks of X. + + This strategy dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num + + with nogil, parallel(num_threads=num_threads): + thread_num = _openmp_thread_num() + + # Allocating thread datastructures + self._on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if (X_chunk_idx == self.X_n_chunks - 1 + and self.X_n_samples_remainder > 0): + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + self._on_X_prange_iter_init(thread_num, X_start, X_end) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if (Y_chunk_idx == self.Y_n_chunks - 1 + and self.Y_n_samples_remainder > 0): + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) nogil: + """Computes the reduction of each vector (row) of X on Y + by parallelizing computation on chunks of Y. + + This strategy dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t num_threads = min(self.Y_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num + + # Allocating datastructures + self._on_Y_init(num_threads) + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=num_threads): + thread_num = _openmp_thread_num() + + # Initializing datastructures used in this thread + self._on_Y_parallel_init(thread_num) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 \ + and self.Y_n_samples_remainder > 0: + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._on_Y_after_parallel(num_threads, X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._on_Y_finalize(num_threads) + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is the core critical region of PairwiseDistanceReductions' computations + which must be implemented in subclasses. + """ + return + + def _finalize_results(self, bint return_distance): + """Call-back adapting datastructures before returning results. + + This must be implemented in subclasses. 
+ """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) nogil: + """Convert ranking-preserving distances to exact distances or recompute them.""" + return + + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _on_X_prange_iter_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + """Allocate datastructures used in threads.""" + return + + cdef void _on_Y_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _on_Y_after_parallel( + self, + ITYPE_t num_threads, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a threads parallel region.""" + return + + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + +cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): + """Computes the argkmin of vectors (rows) of a set of + vectors (rows) of X on another set of vectors (rows) of Y. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + k: int + The k for the argkmin reduction. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~ArgKmin.compute`. + + None and -1 means using all processors. + """ + + cdef: + ITYPE_t k + + ITYPE_t[:, ::1] argkmin_indices + DTYPE_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. + DTYPE_t ** heaps_r_distances_chunks + ITYPE_t ** heaps_indices_chunks + + @classmethod + def get_for( + cls, + X, + Y, + ITYPE_t k, + str metric="fast_euclidean", + chunk_size=None, + dict metric_kwargs=None, + n_threads=None, + ) -> PairwiseDistancesArgKmin: + """Return the PairwiseDistancesArgKmin implementation for the given arguments. + + Parameters + ---------- + X : array-like of shape (n_X, d) + Input data. + + Y : array-like of shape (n_Y, d) + Input data. + + k : int + The k for the argkmin reduction. + + metric : str, default='fast_euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. 
+ + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + n_threads : int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~PairwiseDistancesArgKmin.compute`. + + None and -1 means using all processors. + + Returns + ------- + argkmin: PairwiseDistancesArgKmin + The suited PairwiseDistancesArgKmin implementation. + """ + # This factory comes to handle specialisations. + if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): + use_squared_distances = metric == "fast_sqeuclidean" + return FastEuclideanPairwiseDistancesArgKmin( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size + ) + + return PairwiseDistancesArgKmin( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + ) + + def __init__( + self, + DatasetsPair datasets_pair, + ITYPE_t k, + chunk_size=None, + n_threads=None, + ): + super().__init__(datasets_pair, chunk_size, n_threads) + + check_scalar(k, "k", Integral, min_val=1) + self.k = k + + # Allocating pointers to datastructures but not the datastructures themselves. + # There as many pointers as available threads. + # When reducing on small datasets, there can be more pointers than actual + # threads used for the reduction but there won't be allocated but unused + # datastructures. + self.heaps_r_distances_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + self.heaps_indices_chunks = malloc( + sizeof(ITYPE_t *) * self.effective_omp_n_thread + ) + + # Main heaps used by PairwiseDistancesArgKmin.compute to return results. + self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t k = self.k + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. 
+ for i in range(n_X): + for j in range(n_Y): + heap_push( + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + k, + self._datasets_pair.ranking_preserving_dist(X_start + i, Y_start + j), + Y_start + j, + ) + + @final + cdef void _on_X_prange_iter_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + # As this strategy is embarrassingly parallel, we can set the + # thread heaps pointers to the proper position on the main heaps + self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] + + @final + cdef void _on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting indices of the argkmin for each query vector of X + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + # Maximum number of scalar elements (the last chunks can be smaller) + ITYPE_t heaps_size = self.X_n_samples_chunk * self.k + ITYPE_t thread_num + + for thread_num in prange(num_threads, schedule='static', nogil=True, + num_threads=num_threads): + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own heaps + # which are then synchronised back in the main ones. + self.heaps_r_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(DTYPE_t) + ) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(ITYPE_t) + ) + + @final + cdef void _on_Y_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + # Initialising heaps (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX + self.heaps_indices_chunks[thread_num][idx] = -1 + + @final + cdef void _on_Y_after_parallel( + self, + ITYPE_t num_threads, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num + with nogil, parallel(num_threads=self.effective_omp_n_thread): + # Synchronising the thread heaps with the main heaps + # This is done in parallel samples-wise (no need for locks) + for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(num_threads): + for jdx in range(self.k): + heap_push( + &self.argkmin_distances[X_start + idx, 0], + &self.argkmin_indices[X_start + idx, 0], + self.k, + self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t idx, thread_num + + with nogil, parallel(num_threads=self.effective_omp_n_thread): + # Deallocating temporary datastructures + for thread_num in prange(num_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sort the main heaps into arrays in parallel + # in ascending order w.r.t the distances + for idx in prange(self.n_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) nogil: + cdef: + ITYPE_t i, j + ITYPE_t[:, ::1] Y_indices = self.argkmin_indices + DTYPE_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_X, 
schedule='static', nogil=True, + num_threads=self.effective_omp_n_thread): + for j in range(self.k): + distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + distances[i, j] if distances[i, j] > 0. else 0. + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We eventually need to recompute distances because we relied on proxies. + self.compute_exact_distances() + return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) + + +cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): + """Fast specialized alternative for PairwiseDistancesArgKmin on EuclideanDistance. + + Notes + ----- + This implementation has a superior arithmetic intensity and hence + better running time when the alternative is IO bound, but it can suffer + from numerical instability. + + PairwiseDistancesArgKmin with EuclideanDistance must be used when higher + numerical precision is needed. + """ + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and + not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + ITYPE_t k, + bint use_squared_distances=False, + chunk_size=None, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + k=k, + chunk_size=chunk_size, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + self.use_squared_distances = use_squared_distances + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesArgKmin.compute_exact_distances(self) + + @final + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + PairwiseDistancesArgKmin._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesArgKmin._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_Y_finalize( + self, 
+ ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesArgKmin._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t k = self.k + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + # + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + heap_push( + heaps_r_distances + i * k, + heaps_indices + i * k, + k, + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ), + j + Y_start, + ) + + +cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): + """Returns radius-based neighbors vectors' indices in a dataset Y of + of vectors in a dataset X. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + radius: float + The radius defining the neighborhood. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~PairwiseDistancesRadiusNeighborhood.compute`. + + None and -1 means using all processors. + """ + + cdef: + DTYPE_t radius + + # DistanceMetric compute ranking-preserving surrogate distance via rdist + # which are proxies necessitating less computations. + # We get the equivalent for the radius to be able to compare it against + # vectors' ranking-preserving surrogate distances. 
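+        # Editor's note (illustrative): for the Euclidean case the surrogate is
+        # the squared distance, so `_dist_to_rdist(radius)` amounts to
+        # `radius ** 2` and each candidate pair is tested as
+        # `sq_dist <= radius ** 2`, avoiding one sqrt per pair; only pairs that
+        # are kept get converted back with `_rdist_to_dist` when exact distances
+        # are requested.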
+ DTYPE_t r_radius + + # Neighbors indices and distances are returned as np.ndarray of np.ndarray. + # + # We want resizable buffers which we will to wrapped within numpy + # arrays at the end. std::vector comes as a handy interface for + # interacting efficiently with resizable buffers. + # + # Though it is possible to access their buffer address with + # std::vector::data, they can't be stolen: buffers lifetime + # is tight to their std::vector and are deallocated when + # std::vectors are. + # + # To solve this, we dynamically allocate std::vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when the associated np.ndarray is freed. + vector[vector[ITYPE_t]] * neigh_indices + vector[vector[DTYPE_t]] * neigh_distances + + # Used as array of pointers to private datastructures used in threads. + vector[vector[ITYPE_t]] ** neigh_indices_chunks + vector[vector[DTYPE_t]] ** neigh_distances_chunks + + bint sort_results + + @classmethod + def get_for( + cls, + X, + Y, + DTYPE_t radius, + str metric="fast_euclidean", + chunk_size=None, + dict metric_kwargs=None, + n_threads=None, + bint sort_results=False, + ) -> PairwiseDistancesRadiusNeighborhood: + """Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. + + Parameters + ---------- + X : array-like of shape (n_X, d) + Input data. + + Y : array-like of shape (n_Y, d) + Input data. + + radius : float + The radius defining the neighborhood. + + metric : str, default='fast_euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~PairwiseDistancesRadiusNeighborhood.compute`. + + None and -1 means using all processors. + + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + Returns + ------- + radius_neighborhood: PairwiseDistancesRadiusNeighborhood + The suited PairwiseDistancesRadiusNeighborhood implementation. + """ + # This factory comes to handle specialisations. 
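+        # Editor's sketch of the intended call pattern (hypothetical variable
+        # names, mirroring the tests added in this patch):
+        #
+        #     dist, indices = PairwiseDistancesRadiusNeighborhood.get_for(
+        #         X, Y, radius=10.0, metric="fast_euclidean"
+        #     ).compute(strategy="auto", return_distance=True)
+        #
+        # Both returned values are arrays of ragged per-query arrays, since the
+        # number of neighbors within `radius` differs from one query vector to
+        # another.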
+ if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): + use_squared_distances = metric == "fast_sqeuclidean" + return FastEuclideanPairwiseDistancesRadiusNeighborhood( + X=X, Y=Y, radius=radius, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + sort_results=sort_results, + ) + + return PairwiseDistancesRadiusNeighborhood( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + sort_results=sort_results, + ) + + def __init__( + self, + DatasetsPair datasets_pair, + DTYPE_t radius, + chunk_size=None, + n_threads=None, + sort_results=False + ): + super().__init__(datasets_pair, chunk_size, n_threads) + + check_scalar(radius, "radius", Real, min_val=0) + self.radius = radius + self.r_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) + self.sort_results = sort_results + + # Allocating pointers to datastructures but not the datastructures themselves. + # There as many pointers as available threads. + # When reducing on small datasets, there can be more pointers than actual + # threads used for the reduction but there won't be allocated but unused + # datastructures. + self.neigh_distances_chunks = malloc( + sizeof(self.neigh_distances) * self.effective_omp_n_thread + ) + self.neigh_indices_chunks = malloc( + sizeof(self.neigh_indices) * self.effective_omp_n_thread + ) + + # Temporary datastructures which will be coerced to numpy arrays on before + # PairwiseDistancesRadiusNeighborhood.compute "return" and will be then freed. + self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) + self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) + + def __dealloc__(self): + if self.neigh_distances_chunks is not NULL: + free(self.neigh_distances_chunks) + + if self.neigh_indices_chunks is not NULL: + free(self.neigh_indices_chunks) + + if self.neigh_indices is not NULL: + del self.neigh_indices + + if self.neigh_distances is not NULL: + del self.neigh_distances + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t r_dist_i_j + + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): + r_dist_i_j = self._datasets_pair.ranking_preserving_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + + @final + cdef void _on_X_prange_iter_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread vectors' pointers to the main vectors'. 
+ self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + + @final + cdef void _on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t thread_num + # As chunks of X are shared across threads, so must datastructures + # to avoid race conditions. + # Each thread has its own vectors of n_X vectors which are then merged + # back in the main n_X vectors. + for thread_num in range(num_threads): + self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X) + self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X) + + @final + cdef void _merge_vectors( + self, + ITYPE_t idx, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t thread_num + ITYPE_t idx_n_elements = 0 + ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers only once for the given + for thread_num in range(num_threads): + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + + # Moving the elements by range using the range first element + # as the reference for the insertion + for thread_num in range(num_threads): + move( + deref(self.neigh_distances_chunks[thread_num])[idx].begin(), + deref(self.neigh_distances_chunks[thread_num])[idx].end(), + deref(self.neigh_distances)[idx].begin() + last_element_idx + ) + move( + deref(self.neigh_indices_chunks[thread_num])[idx].begin(), + deref(self.neigh_indices_chunks[thread_num])[idx].end(), + deref(self.neigh_indices)[idx].begin() + last_element_idx + ) + last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + + with nogil, parallel(num_threads=self.effective_omp_n_thread): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks) + # using dynamic scheduling because we generally do not have + # the same number of neighbors for each query vectors. + # TODO: compare 'dynamic' vs 'static' vs 'guided' + for idx in prange(self.n_X, schedule='dynamic'): + self._merge_vectors(idx, num_threads) + + # The content of the vector have been std::moved, + # Hence they can't be used anymore and can only be deleted. 
+ for thread_num in prange(num_threads, schedule='static'): + del self.neigh_distances_chunks[thread_num] + del self.neigh_indices_chunks[thread_num] + + # Sort in parallel in ascending order w.r.t the distances if needed + if self.sort_results: + for idx in prange(self.n_X, schedule='static'): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + return + + cdef void compute_exact_distances(self) nogil: + """Convert ranking-preserving distances to pairwise distances in parallel.""" + cdef: + ITYPE_t i, j + + for i in prange(self.n_X, nogil=True, schedule='static', + num_threads=self.effective_omp_n_thread): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self._datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + deref(self.neigh_distances)[i][j] + if deref(self.neigh_distances)[i][j] > 0. + else 0 + ) + ) + + @final + def compute( + self, + str strategy=None, + bint return_distance=False, + ): + if self.sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + return super().compute(strategy=strategy, return_distance=return_distance) + + +cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): + """Fast specialized alternative for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. + + Notes + ----- + This implementation has a superior arithmetic intensity and hence + better running time when the alternative is IO bound, but it can suffer + from numerical instability. + + RadiusNeighborhood with EuclideanDistance must be used when higher + numerical precision is needed. + """ + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + DTYPE_t radius, + bint use_squared_distances=False, + chunk_size=None, + sort_results=False, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size, + sort_results=sort_results, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + self.use_squared_distances = use_squared_distances + + if use_squared_distances: + # In this specialisation and this setup, the value passed to the radius is + # already considered to be the adapted radius, so we overwrite it. 
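+            # Editor's note: the parent __init__ above already set
+            # r_radius = radius ** 2 via `_dist_to_rdist`; with `fast_sqeuclidean`
+            # the caller is expected to pass an already-squared radius, which is
+            # why it is taken as-is here (my reading of this specialisation).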
+ self.r_radius = radius + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) + + @final + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + PairwiseDistancesRadiusNeighborhood._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesRadiusNeighborhood._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesRadiusNeighborhood._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + # + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices in vectors. 
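+        # Editor's note: a minimal NumPy sketch of the expansion used above on
+        # small toy chunks (illustrative only, not part of this file):
+        #
+        #     import numpy as np
+        #     rng = np.random.RandomState(0)
+        #     X_c, Y_c = rng.rand(4, 3), rng.rand(5, 3)
+        #     sq_dists = (
+        #         (X_c ** 2).sum(axis=1)[:, None]      # ||X_c||² row norms
+        #         - 2.0 * X_c @ Y_c.T                  # the GEMM middle term
+        #         + (Y_c ** 2).sum(axis=1)[None, :]    # ||Y_c||² row norms
+        #     )
+        #     ref = ((X_c[:, None, :] - Y_c[None, :, :]) ** 2).sum(axis=-1)
+        #     assert np.allclose(sq_dists, ref)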
+ for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + squared_dist_i_j = ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ) + if squared_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) + + +cdef class Kernel(PairwiseDistancesReduction): + + cdef: + DTYPE_t[:, ::1] K + + @classmethod + def get_for( + cls, + X, + Y, + str kernel="rbf", + chunk_size=None, + dict kernel_kwargs=None, + n_threads=None, + ) -> PairwiseDistancesArgKmin: + """Return the Kernel implementation for the given arguments. + + Parameters + ---------- + kernel : str, default='rbf' + The kernel to use. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + kernel_kwargs : dict, default=None + Keyword arguments to pass to specified kernel. + + n_threads : int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~Kernel.compute`. + + None and -1 means using all processors. + + Returns + ------- + argkmin: PairwiseDistancesArgKmin + The suited PairwiseDistancesArgKmin implementation. + """ + # This factory comes to handle specialisations. + if kernel == "rbf": + return RBFKernel(X, Y, chunk_size=chunk_size) + else: + raise ValueError(f"Unsupported kernel: {kernel}") + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + ): + super().__init__(datasets_pair, chunk_size, n_threads) + + # The Gram matrix: K[i,j] = K(x_i, y_j) + self.K = np.empty((self.n_X, self.n_Y), dtype=DTYPE) + + def compute( + self, + str strategy=None, + ): + """Computes the kernel between vectors of X and Y. + + Parameters + ---------- + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + Returns + ------- + K : ndarray of shape (n_X, n_Y) + A kernel matrix K such that K_{i, j} is the kernel between the + ith and jth vectors of the given matrix X and Y. + """ + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. 
+ if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + + return self._finalize_results() + +cdef class RBFKernel(Kernel): + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances + + DTYPE_t gamma + + def __init__( + self, + X, + Y, + gamma=None, + chunk_size=None, + n_threads=None, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + chunk_size=chunk_size, + n_threads=n_threads + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + + self.gamma = 1.0 / X.shape[1] if gamma is None else gamma + + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (super().is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + @final + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + Kernel._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + Kernel._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + Kernel._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + Kernel._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + 
# + # exp(- gamma ||X_c - Y_c||²) = exp(- gamma( ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²) ) + # + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices in vectors. + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + squared_dist_i_j = ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ) + self.K[i + X_start, j + Y_start] = - self.gamma * squared_dist_i_j + + + def _finalize_results(self): + return np.exp(self.K) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index d493ad68603ea..7a60bbb0b4ef1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,6 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version +from ._pairwise_distances_reduction import PairwiseDistancesArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -574,6 +575,10 @@ def _argmin_min_reduce(dist, start): return indices, values +def _argmin_reduce(dist, start): + return dist.argmin(axis=1) + + def pairwise_distances_argmin_min( X, Y, *, axis=1, metric="euclidean", metric_kwargs=None ): @@ -646,19 +651,33 @@ def pairwise_distances_argmin_min( """ X, Y = check_pairwise_arrays(X, Y) - if metric_kwargs is None: - metric_kwargs = {} - if axis == 0: X, Y = Y, X - indices, values = zip( - *pairwise_distances_chunked( - X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + values, indices = PairwiseDistancesArgKmin.get_for( + X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs + ).compute(strategy="auto", return_distance=True) + values = values.flatten() + indices = indices.flatten() + else: + # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + # When PairwiseDistancesArgKmin is not supported and when the user + # asked for a fast alternative, we need to revert to the standard one. 
+ if metric == "fast_euclidean": + metric = "euclidean" + + indices, values = zip( + *pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + ) ) - ) - indices = np.concatenate(indices) - values = np.concatenate(values) + indices = np.concatenate(indices) + values = np.concatenate(values) return indices, values @@ -730,9 +749,38 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min( - X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs - )[0] + X, Y = check_pairwise_arrays(X, Y) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + indices = PairwiseDistancesArgKmin.get_for( + X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs + ).compute(strategy="auto", return_distance=False) + indices = indices.flatten() + else: + # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + # When PairwiseDistancesArgKmin is not supported and when the user + # asked for a fast alternative, we need to revert to the standard one. + if metric == "fast_euclidean": + metric = "euclidean" + + indices = np.concatenate( + list( + # This returns a np.ndarray generator whose arrays we need + # to flatten into one. + pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) + ) + ) + + return indices def haversine_distances(X, Y=None): @@ -780,7 +828,7 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ - from ..neighbors import DistanceMetric + from ..metrics import DistanceMetric return DistanceMetric.get_metric("haversine").pairwise(X, Y) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index df1a1caad17e0..cd32817574dd3 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -1,4 +1,5 @@ import os +import numpy as np from numpy.distutils.misc_util import Configuration @@ -18,6 +19,20 @@ def configuration(parent_package="", top_path=None): "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries ) + config.add_extension( + "_pairwise_distances_reduction", + sources=["_pairwise_distances_reduction.pyx"], + language="c++", + libraries=libraries, + ) + + config.add_extension( + "_dist_metrics", + sources=["_dist_metrics.pyx"], + include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py similarity index 93% rename from sklearn/neighbors/tests/test_dist_metrics.py rename to sklearn/metrics/tests/test_dist_metrics.py index 08298f087c216..9f0750fd75669 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -7,8 +7,8 @@ import pytest from scipy.spatial.distance import cdist -from sklearn.neighbors import DistanceMetric -from sklearn.neighbors import BallTree +from sklearn.metrics import DistanceMetric +from sklearn.metrics._dist_metrics import BOOL_METRICS from sklearn.utils import check_random_state from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -37,16 +37,6 @@ def dist_func(x1, x2, p): V = rng.random_sample((d, d)) VI = np.dot(V, V.T) 
-BOOL_METRICS = [ - "matching", - "jaccard", - "dice", - "kulsinski", - "rogerstanimoto", - "russellrao", - "sokalmichener", - "sokalsneath", -] METRICS_DEFAULT_PARAMS = { "euclidean": {}, @@ -62,6 +52,16 @@ def dist_func(x1, x2, p): } +# TODO: remove this test in 1.2 +def test_neighbors_distance_metric_deprecation(): + from sklearn.neighbors import DistanceMetric as DeprecatedDistanceMetric + + with pytest.warns( + FutureWarning, match="sklearn.neighbors.DistanceMetric has been moved" + ): + DeprecatedDistanceMetric.get_metric("euclidean") + + @pytest.mark.parametrize("metric", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_cdist(metric, X1, X2): @@ -230,16 +230,6 @@ def test_pyfunc_metric(): assert_array_almost_equal(D1_pkl, D2_pkl) -def test_bad_pyfunc_metric(): - def wrong_distance(x, y): - return "1" - - X = np.ones((5, 2)) - msg = "Custom distance function must accept two vectors" - with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_distance) - - def test_input_data_size(): # Regression test for #6288 # Previously, a metric requiring a particular input dimension would fail diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index b7e90e63f2af1..90b8db305b83b 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -440,7 +440,6 @@ def test_pairwise_distances_argmin_min(): expected_idx = [0, 1] expected_vals = [2, 2] - expected_vals_sq = [4, 4] # euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") @@ -458,10 +457,12 @@ def test_pairwise_distances_argmin_min(): # euclidean metric squared idx, vals = pairwise_distances_argmin_min( - X, Y, metric="euclidean", metric_kwargs={"squared": True} + X, + Y, + metric="fast_euclidean", ) assert_array_almost_equal(idx, expected_idx) - assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(vals, expected_vals) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") @@ -1464,3 +1465,34 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): # and fails due to rounding errors rtol = 1e-5 if dtype is np.float32 else 1e-7 assert_allclose(dist, expected_dist, rtol=rtol) + + +@pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.parametrize("sign", [1, -1]) +def test_fast_euclidean_correctness( + X_translation, Y_translation, sign, n_samples=10000, n_features=10 +): + # This is the only failing test case, so we prefer xfailing. 
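+    # Editor's note (rough reasoning): with translations around 1e6-1e7 and a
+    # spread of 100, the row norms ||x||² and ||y||² grow to roughly 1e13-1e15
+    # while the actual squared distances stay near spread² ~ 1e4, so the
+    # ||x||² - 2 x·y + ||y||² expansion subtracts nearly equal float64 terms and
+    # can lose enough precision to exceed the rtol=1e-5 tolerance below; hence
+    # the xfail for these two extreme cases.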
+ numerical_edge_cases = {(1e6, 1e6, 1), (1e7, 1e7, 1)} + if (X_translation, Y_translation, sign) in numerical_edge_cases: + pytest.xfail( + "Numerical edge-case: (X_translation, Y_translation," + f" sign)={(X_translation, Y_translation, sign)}" + ) + + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + rng = np.random.RandomState(1) + + spread = 100 + X = X_translation + rng.rand(n_samples, n_features) * spread + Y = (Y_translation + rng.rand(n_samples, n_features) * spread) * sign + + argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean") + fsq_argmins, fsq_distances = pairwise_distances_argmin_min( + X, Y, metric="fast_euclidean" + ) + + np.testing.assert_array_equal(argmins, fsq_argmins) + np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py new file mode 100644 index 0000000000000..e68f66fe5a40c --- /dev/null +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -0,0 +1,469 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal, assert_allclose +from scipy.sparse import csr_matrix + +from sklearn.metrics._dist_metrics import ( + DenseDenseDatasetsPair, + DenseSparseDatasetsPair, + SparseDenseDatasetsPair, + SparseSparseDatasetsPair, +) + +from sklearn.metrics._pairwise_distances_reduction import ( + PairwiseDistancesReduction, + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, + FastEuclideanPairwiseDistancesArgKmin, + FastEuclideanPairwiseDistancesRadiusNeighborhood, + _sqeuclidean_row_norms, +) + +from sklearn.utils import _in_unstable_openblas_configuration +from sklearn.utils._testing import ( + fails_if_unstable_openblas, + get_dummy_metric_kwargs, +) + + +def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): + # We get arrays of arrays and we need to check for individual pairs + for i in range(ref_dist.shape[0]): + assert_array_equal( + ref_indices[i], + indices[i], + err_msg=f"Query vector #{i} has different neighbors' indices", + ) + assert_allclose( + ref_dist[i], + dist[i], + err_msg=f"Query vector #{i} has different neighbors' distances", + rtol=1e-7, + ) + + +def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): + assert_array_equal( + ref_indices, + indices, + err_msg="Query vectors have different neighbors' indices", + ) + assert_allclose( + ref_dist, + dist, + err_msg="Query vectors have different neighbors' distances", + rtol=1e-7, + ) + + +ASSERT_RESULT = { + PairwiseDistancesArgKmin: assert_argkmin_results_equality, + PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality, +} + + +def test_pairwise_distances_reduction_is_usable_for(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + assert not PairwiseDistancesReduction.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) + + assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") + # TODO: remove once 32 bits datasets are supported + assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) + assert not 
PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric) + + # TODO: remove once sparse matrices are supported + assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesArgKmin.get_for( + X=X.astype(np.float32), Y=Y, k=k, metric=metric + ) + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=0, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesArgKmin.get_for( + X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesArgKmin.get_for( + X=np.asfortranarray(X), Y=Y, k=k, metric=metric + ) + + +def test_radius_neighborhood_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesRadiusNeighborhood.get_for( + X=X.astype(np.float32), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." 
+ ): + PairwiseDistancesRadiusNeighborhood.get_for( + X=X, Y=Y.astype(np.int32), radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + PairwiseDistancesRadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesRadiusNeighborhood.get_for( + X=X, Y=Y, radius=radius, metric="wrong metric" + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesRadiusNeighborhood.get_for( + X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesRadiusNeighborhood.get_for( + X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric + ) + + +@fails_if_unstable_openblas +@pytest.mark.filterwarnings("ignore:Constructing a DIA matrix") +@pytest.mark.parametrize( + "PairwiseDistancesReduction, FastPairwiseDistancesReduction", + [ + (PairwiseDistancesArgKmin, FastEuclideanPairwiseDistancesArgKmin), + ( + PairwiseDistancesRadiusNeighborhood, + FastEuclideanPairwiseDistancesRadiusNeighborhood, + ), + ], +) +def test_pairwise_distances_reduction_factory_method( + PairwiseDistancesReduction, FastPairwiseDistancesReduction +): + # Test all the combinations of DatasetsPair for creation + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + + # Dummy value for k or radius + dummy_arg = 5 + + dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric) + assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair) + + sparse_sparse_instance = PairwiseDistancesReduction.get_for( + csr_matrix(X), csr_matrix(Y), dummy_arg, metric + ) + assert isinstance(sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair) + + dense_sparse_instance = PairwiseDistancesReduction.get_for( + X, csr_matrix(Y), dummy_arg, metric=metric + ) + assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair) + + sparse_dense_instance = PairwiseDistancesReduction.get_for( + csr_matrix(X), Y, dummy_arg, metric=metric + ) + assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair) + + # Test specialisations creation + fast_euclidean_instance = PairwiseDistancesReduction.get_for( + X, Y, dummy_arg, metric="fast_euclidean" + ) + assert isinstance(fast_euclidean_instance, PairwiseDistancesReduction) + assert isinstance(fast_euclidean_instance, FastPairwiseDistancesReduction) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +def test_chunk_size_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + metric="fast_euclidean", + n_features=100, + dtype=np.float64, +): + # Results should not depend on the chunk size + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.get_for( + X, Y, 
parameter, metric="euclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric=metric, chunk_size=chunk_size + ).compute(return_distance=True) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +def test_n_threads_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + metric="fast_euclidean", + n_features=100, + dtype=np.float64, +): + # Results should not depend on the number of threads + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric="euclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric=metric, n_threads=1 + ).compute(return_distance=True) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +def test_strategies_consistency( + PairwiseDistancesReduction, + metric, + n_samples, + seed, + n_features=10, + dtype=np.float64, +): + # Results obtained using both parallelization strategies must be identical + if _in_unstable_openblas_configuration() and metric == { + "fast_sqeuclidean", + "fast_euclidean", + }: + pytest.xfail( + "OpenBLAS (used for 'fast_(sq)euclidean') is unstable in this configuration" + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + pairwise_distances_reduction = PairwiseDistancesReduction.get_for( + X, + Y, + parameter, + metric=metric, + metric_kwargs=get_dummy_metric_kwargs(metric, n_features), + # To be sure to use parallelization + chunk_size=n_samples // 4, + ) + + dist_par_X, indices_par_X = pairwise_distances_reduction.compute( + strategy="parallel_on_X", return_distance=True + ) + + dist_par_Y, indices_par_Y = pairwise_distances_reduction.compute( + strategy="parallel_on_Y", return_distance=True + ) + + ASSERT_RESULT[PairwiseDistancesReduction]( + dist_par_X, dist_par_Y, indices_par_X, indices_par_Y + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) 
+@pytest.mark.parametrize("k, radius", [(50, 100)]) +def test_fast_sqeuclidean_correctness( + seed, + n_samples, + n_features, + k, + radius, + dtype=np.float64, +): + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + if n_samples < k: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < k (={k})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + eucl_dist, eucl_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="euclidean" + ).compute(return_distance=True) + fse_dist, fse_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="fast_euclidean" + ).compute(return_distance=True) + + assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) + + eucl_dist, eucl_indices = PairwiseDistancesRadiusNeighborhood.get_for( + X, Y, radius, metric="euclidean" + ).compute(return_distance=True) + fse_dist, fse_indices = PairwiseDistancesRadiusNeighborhood.get_for( + X, Y, radius, metric="fast_euclidean" + ).compute(return_distance=True) + + assert_radius_neighborhood_results_equality( + eucl_dist, fse_dist, eucl_indices, fse_indices + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("k", [1, 10, 100]) +@pytest.mark.parametrize("translation", [10 ** i for i in [4]]) +def test_fast_sqeuclidean_translation_invariance( + seed, + n_samples, + n_features, + k, + translation, + dtype=np.float64, +): + # The fast squared euclidean strategy should be translation invariant. 
+ if n_samples < k: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < n_neighbors (={k})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + reference_dist, reference_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesArgKmin.get_for( + X + translation, Y + translation, k, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + assert_argkmin_results_equality(reference_dist, dist, reference_indices, indices) + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("num_threads", [1, 2, 8]) +def test_sqeuclidean_row_norms( + seed, + n_samples, + n_features, + num_threads, + dtype=np.float64, +): + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + + assert_allclose(sq_row_norm_reference, sq_row_norm) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index ceb1df3420e38..779ed9e39f34a 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -399,7 +399,7 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> X, y = make_multilabel_classification(n_classes=3, random_state=0) >>> clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y) >>> clf.predict(X[-2:]) - array([[1, 1, 0], [1, 1, 1]]) + array([[1, 1, 1], [1, 1, 1]]) """ def __init__(self, estimator, *, n_jobs=None): diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8a0934eecf142..ff5ad4875d77d 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -4,8 +4,8 @@ """ from ._ball_tree import BallTree +from ._distance_metric import DistanceMetric from ._kd_tree import KDTree -from ._dist_metrics import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 4e01cf2cd1076..e21a0ffb36a28 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,6 +23,10 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..metrics._pairwise_distances_reduction import ( + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, +) from ..utils import ( check_array, gen_even_slices, @@ -50,6 +54,8 @@ "correlation", "cosine", "dice", + "fast_euclidean", + "fast_sqeuclidean", "hamming", "jaccard", "kulsinski", @@ -361,6 +367,20 @@ def _check_algorithm_metric(self): else: alg_check = self.algorithm + if alg_check != "brute" and self.metric in ( + "fast_sqeuclidean", + "fast_euclidean", + ): + alternative = self.metric.replace("fast_", "") + warnings.warn( + f"'{self.metric}' is only available for algorithm='brute' but" + f" algorithm='{self.algorithm}' is used. 
Falling back on" + f" metric='{alternative}'.", + UserWarning, + stacklevel=3, + ) + self.metric = alternative + if callable(self.metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree @@ -397,7 +417,9 @@ def _check_algorithm_metric(self): def _fit(self, X, y=None): if self._get_tags()["requires_y"]: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csr", multi_output=True, order="C" + ) if is_classifier(self): # Classification targets require a specific format @@ -432,7 +454,7 @@ def _fit(self, X, y=None): else: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X = self._validate_data(X, accept_sparse="csr") + X = self._validate_data(X, accept_sparse="csr", order="C") self._check_algorithm_metric() if self.metric_params is None: @@ -499,6 +521,11 @@ def _fit(self, X, y=None): if issparse(X): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") + + if self.metric in ("fast_sqeuclidean", "fast_euclidean"): + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") + if self.effective_metric_ not in VALID_METRICS_SPARSE[ "brute" ] and not callable(self.effective_metric_): @@ -542,6 +569,8 @@ def _fit(self, X, y=None): else: self._fit_method = "brute" + specialised_metrics = {"euclidean", "sqeuclidean"} + if self._fit_method == "ball_tree": self._tree = BallTree( X, @@ -557,6 +586,13 @@ def _fit(self, X, y=None): **self.effective_metric_params_, ) elif self._fit_method == "brute": + if ( + self.effective_metric_ in specialised_metrics + and self.metric not in specialised_metrics + ): + # In that case, the standard stabler metric has not been explicitly + # specified by the user, so we prefer its fast alternative. + self.effective_metric_ = f"fast_{self.effective_metric_}" self._tree = None else: raise ValueError("algorithm = '%s' not recognized" % self.algorithm) @@ -633,10 +669,7 @@ def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): # argpartition doesn't guarantee sorted order, so we sort again neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == "euclidean": - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind + result = dist[sample_range, neigh_ind], neigh_ind else: result = neigh_ind return result @@ -706,10 +739,21 @@ class from an array representing our data set and ask who's % type(n_neighbors) ) + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if X is not None: query_is_train = False if self.metric == "precomputed": X = _check_precomputed(X) + elif use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because this implementation is more efficient. 
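+                # Editor's note: the Cython reductions declare their inputs as
+                # `const DTYPE_t[:, ::1]` (C-contiguous float64 memoryviews), so
+                # an F-ordered query array would be rejected; requesting
+                # order="C" here pays for a single copy up front instead.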
+ X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") else: X = self._validate_data(X, accept_sparse="csr", reset=False) else: @@ -728,24 +772,40 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): + if use_pairwise_distances_reductions: + results = PairwiseDistancesArgKmin.get_for( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, + ).compute( + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance ) elif self._fit_method == "brute": + # TODO: support sparse matrices + # When ArgKmin is not supported and when the user ask for a + # fast alternative, we need to revert to the standard. + if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") + reduce_func = partial( self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance, ) - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ - chunked_results = list( pairwise_distances_chunked( X, @@ -753,7 +813,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, - **kwds, + **self.effective_metric_params_, ) ) @@ -943,10 +1003,7 @@ def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): neigh_ind = [np.where(d <= radius)[0] for d in dist] if return_distance: - if self.effective_metric_ == "euclidean": - dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] - else: - dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] results = dist, neigh_ind else: results = neigh_ind @@ -1030,10 +1087,21 @@ class from an array representing our data set and ask who's """ check_is_fitted(self) + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesRadiusNeighborhood.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if X is not None: query_is_train = False if self.metric == "precomputed": X = _check_precomputed(X) + elif use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because this implementation is more efficient. 
+ X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") else: X = self._validate_data(X, accept_sparse="csr", reset=False) else: @@ -1043,18 +1111,33 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): + if use_pairwise_distances_reductions: + results = PairwiseDistancesRadiusNeighborhood.get_for( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, + sort_results=sort_results, + ).compute( + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): results = _radius_neighbors_from_graph( X, radius=radius, return_distance=return_distance ) elif self._fit_method == "brute": - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - radius *= radius - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ + # When RadiusNeighborhood is not supported and when the user ask for a + # fast alternative, we need to revert to the standard. + if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") reduce_func = partial( self._radius_neighbors_reduce_func, @@ -1068,7 +1151,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, - **kwds, + **self.effective_metric_params_, ) if return_distance: neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 9f90414994550..32a907d1c6dea 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -142,7 +142,6 @@ # BinaryTree tree2, ITYPE_t i_node2): # """Compute the maximum distance between two nodes""" -cimport cython cimport numpy as np from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax @@ -151,16 +150,16 @@ from libc.string cimport memcpy import numpy as np import warnings -from ..utils import check_array - -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE -from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, - euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ..metrics._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist) from ._partition_nodes cimport partition_node_indices +from ..utils import check_array +from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs import DTYPE, ITYPE +from ..utils._heap cimport simultaneous_sort as _simultaneous_sort, heap_push + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -226,7 +225,7 @@ leaf_size : positive int, default=40 the case that ``n_samples < leaf_size``. metric : str or DistanceMetric object - the distance metric to use for the tree. Default='minkowski' + The distance metric to use for the tree. Default='minkowski' with p=2 (that is, a euclidean metric). See the documentation of the DistanceMetric class for a list of available metrics. 
{binary_tree}.valid_metrics gives a list of the metrics which @@ -489,27 +488,6 @@ def kernel_norm(h, d, kernel, return_log=False): return np.exp(result) -###################################################################### -# Tree Utility Routines -cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2): - """swap the values at index i1 and i2 of arr""" - cdef DITYPE_t tmp = arr[i1] - arr[i1] = arr[i2] - arr[i2] = tmp - - -cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """swap the values at inex i1 and i2 of both darr and iarr""" - cdef DTYPE_t dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp - - cdef class NeighborsHeap: """A max-heap structure to keep track of distances/indices of neighbors @@ -564,52 +542,11 @@ cdef class NeighborsHeap: cdef int _push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val) nogil except -1: """push (val, i_val) into the given row""" - cdef ITYPE_t i, ic1, ic2, i_swap - cdef ITYPE_t size = self.distances.shape[1] - cdef DTYPE_t* dist_arr = &self.distances[row, 0] - cdef ITYPE_t* ind_arr = &self.indices[row, 0] - - # check if val should be in heap - if val >= dist_arr[0]: - return 0 - - # insert val at position zero - dist_arr[0] = val - ind_arr[0] = i_val - - # descend the heap, swapping values until the max heap criterion is met - i = 0 - while True: - ic1 = 2 * i + 1 - ic2 = ic1 + 1 - - if ic1 >= size: - break - elif ic2 >= size: - if dist_arr[ic1] > val: - i_swap = ic1 - else: - break - elif dist_arr[ic1] >= dist_arr[ic2]: - if val < dist_arr[ic1]: - i_swap = ic1 - else: - break - else: - if val < dist_arr[ic2]: - i_swap = ic2 - else: - break - - dist_arr[i] = dist_arr[i_swap] - ind_arr[i] = ind_arr[i_swap] - - i = i_swap - - dist_arr[i] = val - ind_arr[i] = i_val - - return 0 + cdef: + ITYPE_t size = self.distances.shape[1] + DTYPE_t* dist_arr = &self.distances[row, 0] + ITYPE_t* ind_arr = &self.indices[row, 0] + return heap_push(dist_arr, ind_arr, size, val, i_val) cdef int _sort(self) except -1: """simultaneously sort the distances and indices""" @@ -622,68 +559,6 @@ cdef class NeighborsHeap: distances.shape[1]) return 0 - -cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, - ITYPE_t size) nogil except -1: - """ - Perform a recursive quicksort on the dist array, simultaneously - performing the same swaps on the idx array. The equivalent in - numpy (though quite a bit slower) is - - def simultaneous_sort(dist, idx): - i = np.argsort(dist) - return dist[i], idx[i] - """ - cdef ITYPE_t pivot_idx, i, store_idx - cdef DTYPE_t pivot_val - - # in the small-array case, do things efficiently - if size <= 1: - pass - elif size == 2: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - elif size == 3: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - if dist[1] > dist[2]: - dual_swap(dist, idx, 1, 2) - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - else: - # Determine the pivot using the median-of-three rule. - # The smallest of the three is moved to the beginning of the array, - # the middle (the pivot value) is moved to the end, and the largest - # is moved to the pivot index. - pivot_idx = size / 2 - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - if dist[size - 1] > dist[pivot_idx]: - dual_swap(dist, idx, size - 1, pivot_idx) - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - pivot_val = dist[size - 1] - - # partition indices about pivot. 
At the end of this operation, - # pivot_idx will contain the pivot value, everything to the left - # will be smaller, and everything to the right will be larger. - store_idx = 0 - for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) - store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) - pivot_idx = store_idx - - # recursively sort each side of the pivot - if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) - if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, - size - pivot_idx - 1) - return 0 - #------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of @@ -878,7 +753,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from ._dist_metrics import get_valid_metric_ids +from ..metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index d616eaa2f32a8..e744e52c40c59 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -11,7 +11,7 @@ import numpy as np from scipy import stats from ..utils.extmath import weighted_mode -from ..utils.validation import _is_arraylike, _num_samples +from ..utils.validation import _is_arraylike, _num_samples, check_is_fitted import warnings from ._base import _check_weights, _get_weights @@ -65,10 +65,10 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. metric : str or callable, default='minkowski' - The distance metric to use for the tree. The default metric is + The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -211,7 +211,21 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ - neigh_dist, neigh_ind = self.kneighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) + classes_ = self.classes_ _y = self._y if not self.outputs_2d_: @@ -253,7 +267,20 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. 
""" - neigh_dist, neigh_ind = self.kneighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) classes_ = self.classes_ _y = self._y @@ -344,8 +371,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors metric : str or callable, default='minkowski' Distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -609,10 +636,24 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) n_queries = _num_samples(X) - neigh_dist, neigh_ind = self.radius_neighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + original_radius = self.radius + self.effective_metric_ = "fast_sqeuclidean" + self.radius = original_radius * original_radius + neigh_dist, neigh_ind = self.radius_neighbors(X) + self.radius = original_radius + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.radius_neighbors(X) + outlier_mask = np.zeros(n_queries, dtype=bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] outliers = np.flatnonzero(outlier_mask) diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py new file mode 100644 index 0000000000000..10d6e24139068 --- /dev/null +++ b/sklearn/neighbors/_distance_metric.py @@ -0,0 +1,20 @@ +# TODO: Remove this file in 1.2 +import warnings + +from ..metrics import DistanceMetric as _DistanceMetric + + +class DistanceMetric(_DistanceMetric): + @classmethod + def _warn(cls): + warnings.warn( + "sklearn.neighbors.DistanceMetric has been moved " + "to sklearn.metrics.DistanceMetric in 1.0. " + "This import path will be removed in 1.2", + category=FutureWarning, + ) + + @classmethod + def get_metric(cls, metric, **kwargs): + DistanceMetric._warn() + return _DistanceMetric.get_metric(metric, **kwargs) diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index e6fdeffe3b291..9afa37b71a808 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -65,10 +65,11 @@ def kneighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the k-Neighbors for each sample - point. 
The DistanceMetric class gives a list of available metrics. - The default distance is 'euclidean' ('minkowski' metric with the p - param equal to 2.) + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is @@ -157,10 +158,11 @@ def radius_neighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the neighbors within a - given radius for each sample point. The DistanceMetric class - gives a list of available metrics. The default distance is - 'euclidean' ('minkowski' metric with the param equal to 2.) + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 522e826632824..94b02002d7a1e 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,4 +1,4 @@ -from ._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs cimport DTYPE_t, ITYPE_t cdef int partition_node_indices( DTYPE_t *data, diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 75ef124ad1711..5ea2db7ce4d21 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,6 +18,7 @@ from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin from ..utils.deprecation import deprecated +from ..utils.validation import check_is_fitted class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): @@ -75,8 +76,8 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -226,7 +227,20 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. """ - neigh_dist, neigh_ind = self.kneighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. 
+ self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) weights = _get_weights(neigh_dist, self.weights) @@ -306,8 +320,8 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBa metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -434,7 +448,23 @@ def predict(self, X): dtype=double Target values. """ - neigh_dist, neigh_ind = self.radius_neighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + original_radius = self.radius + self.effective_metric_ = "fast_sqeuclidean" + self.radius = original_radius * original_radius + neigh_dist, neigh_ind = self.radius_neighbors(X) + self.radius = original_radius + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.radius_neighbors(X) weights = _get_weights(neigh_dist, self.weights) diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 6b6eec1a3112b..440ac41eb71d5 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
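Note on the import-path change running through the estimator docstrings above: `DistanceMetric` now lives in `sklearn.metrics`, and the `sklearn/neighbors/_distance_metric.py` shim added by this patch only re-exports it with a `FutureWarning`. The following is an illustrative sketch, not part of the patch; it assumes the shim is what `sklearn.neighbors.DistanceMetric` resolves to, as the `__init__.py` change earlier in this diff indicates:

    import warnings
    import numpy as np
    from sklearn.metrics import DistanceMetric  # new canonical location as of 1.0

    X = np.array([[0.0, 0.0], [3.0, 4.0]])
    dist = DistanceMetric.get_metric("minkowski", p=2)
    print(dist.pairwise(X))  # [[0., 5.], [5., 0.]]

    # The old import path keeps working until 1.2 but warns when used:
    from sklearn.neighbors import DistanceMetric as OldDistanceMetric
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        OldDistanceMetric.get_metric("euclidean")
    assert any(issubclass(w.category, FutureWarning) for w in caught)

This is why the docstrings touched above now point at :class:`~sklearn.metrics.DistanceMetric` rather than the neighbors-local class.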
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 85305efc29c78..aa19ba501b18d 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -32,19 +32,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_dist_metrics", - sources=["_dist_metrics.pyx"], - include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), "numpy")], - libraries=libraries, - ) - - config.add_extension( - "_typedefs", - sources=["_typedefs.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) config.add_extension( "_quad_tree", sources=["_quad_tree.pyx"], diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index c751539f2a1ae..a823a03251a1b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -4,7 +4,6 @@ import pytest from numpy.testing import assert_array_almost_equal from sklearn.neighbors._ball_tree import BallTree -from sklearn.neighbors import DistanceMetric from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -40,6 +39,8 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + X, Y = check_array(X), check_array(Y) D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) ind = np.argsort(D, axis=1)[:, :k] @@ -84,3 +85,13 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) + + +def test_bad_pyfunc_metric(): + def wrong_distance(x, y): + return "1" + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a9592ff9f2c51..6121ee6b1f2ee 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -3,6 +3,7 @@ import pytest import re import numpy as np +import scipy from scipy.sparse import ( bsr_matrix, coo_matrix, @@ -22,11 +23,21 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split -from sklearn.neighbors import VALID_METRICS_SPARSE, VALID_METRICS -from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed +from sklearn.neighbors import ( + VALID_METRICS_SPARSE, +) +from sklearn.neighbors._base import ( + _is_sorted_by_data, + _check_precomputed, + KNeighborsMixin, +) from sklearn.pipeline import make_pipeline -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + get_dummy_metric_kwargs, +) from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version @@ -50,6 +61,9 @@ SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") +COMMON_VALID_METRICS = sorted( + set.intersection(*map(set, neighbors.VALID_METRICS.values())) +) P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -70,42 +84,316 @@ 
def _weight_func(dist): return retval ** 2 +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("n_neighbors", [1, 10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) def test_unsupervised_kneighbors( - n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5 + n_samples, + n_features, + n_query_pts, + n_neighbors, + metric, ): - # Test unsupervised neighbors methods - X = rng.rand(n_samples, n_features) + # The different algorithms must return identical results + # on their common metrics, with and without returning + # distances - test = rng.rand(n_query_pts, n_features) + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) - for p in P: - results_nodist = [] - results = [] + test = local_rng.rand(n_query_pts, n_features) - for algorithm in ALGORITHMS: - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, algorithm=algorithm, p=p - ) - neigh.fit(X) + results_nodist = [] + results = [] - results_nodist.append(neigh.kneighbors(test, return_distance=False)) - results.append(neigh.kneighbors(test, return_distance=True)) + for algorithm in ALGORITHMS: + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, algorithm=algorithm, metric=metric + ) + neigh.fit(X) + + results_nodist.append(neigh.kneighbors(test, return_distance=False)) + results.append(neigh.kneighbors(test, return_distance=True)) + + for i in range(len(results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + indices_no_dist = results_nodist[i] + distances, next_distances = results[i][0], results[i + 1][0] + indices, next_indices = results[i][1], results[i + 1][1] + assert_allclose( + indices_no_dist, + indices, + err_msg=( + f"The '{algorithm}' algorithm returns different" + "indices depending on 'return_distances'." + ), + ) + assert_allclose( + indices, + next_indices, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different indices." + ), + ) + assert_allclose( + distances, + next_distances, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different distances." + ), + ) - for i in range(len(results) - 1): - assert_array_almost_equal(results_nodist[i], results[i][1]) - assert_array_almost_equal(results[i][0], results[i + 1][0]) - assert_array_almost_equal(results[i][1], results[i + 1][1]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) +@pytest.mark.parametrize( + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neigh_predictions_algorithm_agnosticity( + n_samples, + n_features, + n_query_pts, + metric, + n_neighbors, + radius, + NeighborsMixinSubclass, +): + # The different algorithms must return identical predictions results + # on their common metrics. 
+ + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) + y = local_rng.randint(3, size=n_samples) + + query = local_rng.rand(n_query_pts, n_features) + + predict_results = [] + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + for algorithm in ALGORITHMS: + neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) + neigh.fit(X, y) + + predict_results.append(neigh.predict(query)) + + for i in range(len(predict_results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + predictions, next_predictions = predict_results[i], predict_results[i + 1] + + assert_allclose( + predictions, + next_predictions, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different predictions." + ), + ) + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( - "NearestNeighbors", + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neighs_predictions_fast_euclidean_correctness( + seed, + n_samples, + n_features, + n_neighbors, + radius, + NeighborsMixinSubclass, + dtype=np.float64, +): + # The fast euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + if n_samples < n_neighbors: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + euclidean_est = NeighborsMixinSubclass( + parameter, algorithm="brute", metric="euclidean" + ).fit(X, y) + euclidean_pred = euclidean_est.predict(X) + + fast_euclidean_clf = NeighborsMixinSubclass( + parameter, algorithm="brute", metric="fast_euclidean" + ).fit(X, y) + fast_euclidean_pred = fast_euclidean_clf.predict(X) + + assert_allclose(euclidean_pred, fast_euclidean_pred) + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +@pytest.mark.parametrize( + "weights, expected_kneighbors_metric", + [ + ("uniform", "fast_sqeuclidean"), + ("distance", "fast_euclidean"), + (lambda x: x, "fast_euclidean"), + ], +) +def test_knn_prediction_fast_euclidean_overriding( + KNeighborsEstimator, + weights, + expected_kneighbors_metric, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast squared euclidean metric must be used over the fast euclidean + # metric solely when using the uniform sample-weighting. 
+ class MockedKNeighborsEstimator(KNeighborsEstimator): + def kneighbors(self, *args, **kwargs): + self.kneighbors_metric_ = self.effective_metric_ + return super().kneighbors(*args, **kwargs) + + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + parameter = 10 + + fast_euclidean_est = MockedKNeighborsEstimator( + parameter, + algorithm="brute", + metric="fast_euclidean", + weights=weights, + ).fit(X, y) + + # effective_metric_ must not be changed + assert fast_euclidean_est.effective_metric_ == "fast_euclidean" + fast_euclidean_est.predict(X) + assert fast_euclidean_est.kneighbors_metric_ == expected_kneighbors_metric + assert fast_euclidean_est.effective_metric_ == "fast_euclidean" + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) +def test_knn_prediction_fast_alternatives_fall_back_on_tree( + KNeighborsEstimator, + algorithm, + specified_metric="fast_euclidean", + fall_back_metric="euclidean", + parameter=10, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast euclidean metric can't be used on "kd_tree", "ball_tree". + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + est = KNeighborsEstimator( + parameter, + algorithm=algorithm, + metric=specified_metric, + ) + with pytest.warns( + UserWarning, + match=( + f"'{specified_metric}' is only available for algorithm='brute' but " + f"algorithm='{algorithm}' is used. Falling " + f"back on metric='{fall_back_metric}'." + ), + ): + est.fit(X, y) + + assert est.metric == fall_back_metric + assert est.effective_metric_ == fall_back_metric + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +def test_knn_prediction_fast_alternatives_fall_back_on_sparse( + KNeighborsEstimator, + specified_metric="fast_euclidean", + fall_back_metric="euclidean", + parameter=10, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast euclidean metric can't be used on sparse datasets. 
+ rng = np.random.RandomState(0) + X = scipy.sparse.random(n_samples, n_features, density=0.25, random_state=rng) + y = rng.randint(3, size=n_samples) + + est = KNeighborsEstimator( + parameter, + algorithm="brute", + metric=specified_metric, + ) + est.fit(X, y) + assert est.effective_metric_ == fall_back_metric + + +@pytest.mark.parametrize( + "KNeighborsMixinSubclass", [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors, ], ) -def test_unsupervised_inputs(NearestNeighbors): +def test_unsupervised_inputs(KNeighborsMixinSubclass): # Test unsupervised inputs for neighbors estimators X = rng.random_sample((10, 3)) @@ -115,7 +403,7 @@ def test_unsupervised_inputs(NearestNeighbors): dist1, ind1 = nbrs_fid.kneighbors(X) - nbrs = NearestNeighbors(n_neighbors=1) + nbrs = KNeighborsMixinSubclass(n_neighbors=1) for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): nbrs.fit(data, y) @@ -1168,19 +1456,19 @@ def test_kneighbors_graph(): assert_array_almost_equal(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) -def test_kneighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_kneighbors_graph_sparse(n_neighbors, mode, seed=36): # Test kneighbors_graph to build the k-Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), - ) + assert_array_almost_equal( + neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_radius_neighbors_graph(): @@ -1196,21 +1484,19 @@ def test_radius_neighbors_graph(): ) -def test_radius_neighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_radius_neighbors_graph_sparse(n_neighbors, mode, seed=36): # Test radius_neighbors_graph to build the Nearest Neighbor graph # for sparse input. 
rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.radius_neighbors_graph( - Xcsr, n_neighbors, mode=mode - ).toarray(), - ) + assert_array_almost_equal( + neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_neighbors_badargs(): @@ -1275,77 +1561,50 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") -def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +def test_neighbors_metrics( + metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 +): # Test computing the neighbors for various metrics - # create a symmetric matrix - V = rng.rand(n_features, n_features) - VI = np.dot(V, V.T) - - metrics = [ - ("euclidean", {}), - ("manhattan", {}), - ("minkowski", dict(p=1)), - ("minkowski", dict(p=2)), - ("minkowski", dict(p=3)), - ("minkowski", dict(p=np.inf)), - ("chebyshev", {}), - ("seuclidean", dict(V=rng.rand(n_features))), - ("wminkowski", dict(p=3, w=rng.rand(n_features))), - ("mahalanobis", dict(VI=VI)), - ("haversine", {}), - ] - algorithms = ["brute", "ball_tree", "kd_tree"] - X = rng.rand(n_samples, n_features) + if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): + pytest.skip("wminkowski will be removed in SciPy 1.8.0") + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) - for metric, metric_params in metrics: - if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): - # wminkowski will be removed in SciPy 1.8.0 - continue - results = {} - p = metric_params.pop("p", 2) - for algorithm in algorithms: - # KD tree doesn't support all metrics - if algorithm == "kd_tree" and metric not in neighbors.KDTree.valid_metrics: - est = neighbors.NearestNeighbors( - algorithm=algorithm, metric=metric, metric_params=metric_params - ) - with pytest.raises(ValueError): - est.fit(X) - continue - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, - algorithm=algorithm, - metric=metric, - p=p, - metric_params=metric_params, - ) - - # Haversine distance only accepts 2D data - feature_sl = slice(None, 2) if metric == "haversine" else slice(None) - - neigh.fit(X[:, feature_sl]) - - # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 - ExceptionToAssert = None - if ( - metric == "wminkowski" - and algorithm == "brute" - and sp_version >= parse_version("1.6.0") - ): - ExceptionToAssert = DeprecationWarning + algorithms = ["brute", "ball_tree", "kd_tree"] + metric_params = get_dummy_metric_kwargs(metric, n_features) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X[:, feature_sl]) + X_test = np.ascontiguousarray(test[:, feature_sl]) + else: + X_train = X + X_test = test + + results = {} + p = metric_params.pop("p", 2) + for algorithm in algorithms: + # KD tree doesn't support all metrics + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm=algorithm, + metric=metric, + p=p, + metric_params=metric_params, + ) - with pytest.warns(ExceptionToAssert): - results[algorithm] = neigh.kneighbors( - test[:, feature_sl], return_distance=True - ) + 
neigh.fit(X_train) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) - assert_array_almost_equal(results["brute"][0], results["ball_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["ball_tree"][1]) - if "kd_tree" in results: - assert_array_almost_equal(results["brute"][0], results["kd_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["kd_tree"][1]) + assert_allclose(results["brute"][0], results["ball_tree"][0]) + assert_allclose(results["brute"][1], results["ball_tree"][1]) + if "kd_tree" in results: + assert_allclose(results["brute"][0], results["kd_tree"][0]) + assert_allclose(results["brute"][1], results["kd_tree"][1]) def test_callable_metric(): @@ -1369,59 +1628,44 @@ def custom_metric(x1, x2): assert_array_almost_equal(dist1, dist2) -def test_valid_brute_metric_for_auto_algorithm(): - X = rng.rand(12, 12) +@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features=12): + X = rng.rand(n_samples, n_features) Xcsr = csr_matrix(X) - # check that there is a metric that is valid for brute - # but not ball_tree (so we actually test something) - assert "cosine" in VALID_METRICS["brute"] - assert "cosine" not in VALID_METRICS["ball_tree"] + metric_params = get_dummy_metric_kwargs(metric, n_features) - # Metric which don't required any additional parameter - require_params = ["mahalanobis", "wminkowski", "seuclidean"] - for metric in VALID_METRICS["brute"]: - if metric != "precomputed" and metric not in require_params: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric - ) - if metric != "haversine": - nn.fit(X) - nn.kneighbors(X) - else: - nn.fit(X[:, :2]) - nn.kneighbors(X[:, :2]) - elif metric == "precomputed": - X_precomputed = rng.random_sample((10, 4)) - Y_precomputed = rng.random_sample((3, 4)) - DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") - DYX = metrics.pairwise_distances( - Y_precomputed, X_precomputed, metric="euclidean" - ) - nb_p = neighbors.NearestNeighbors(n_neighbors=3) - nb_p.fit(DXX) - nb_p.kneighbors(DYX) + if metric == "precomputed": + X_precomputed = rng.random_sample((10, 4)) + Y_precomputed = rng.random_sample((3, 4)) + DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") + DYX = metrics.pairwise_distances( + Y_precomputed, X_precomputed, metric="euclidean" + ) + nb_p = neighbors.NearestNeighbors(n_neighbors=3, metric="precomputed") + nb_p.fit(DXX) + nb_p.kneighbors(DYX) - for metric in VALID_METRICS_SPARSE["brute"]: - if metric != "precomputed" and metric not in require_params: + else: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric, metric_params=metric_params + ) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) + else: + X = X + + nn.fit(X) + nn.kneighbors(X) + + if metric in VALID_METRICS_SPARSE["brute"]: nn = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=metric ).fit(Xcsr) nn.kneighbors(Xcsr) - # Metric with parameter - VI = np.dot(X, X.T) - list_metrics = [ - ("seuclidean", dict(V=rng.rand(12))), - ("wminkowski", dict(w=rng.rand(12))), - ("mahalanobis", dict(VI=VI)), - ] - for metric, params in list_metrics: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric, metric_params=params - ).fit(X) - nn.kneighbors(X) - def test_metric_params_interface(): X = rng.rand(5, 
5) @@ -1513,82 +1757,86 @@ def test_k_and_radius_neighbors_train_is_not_query(): assert_array_equal(rng.A, [[0, 1], [1, 1]]) -def test_k_and_radius_neighbors_X_None(): +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_X_None(algorithm): # Test kneighbors et.al when query is None - for algorithm in ALGORITHMS: + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + + dist, ind = nn.kneighbors() + assert_array_equal(dist, [[1], [1]]) + assert_array_equal(ind, [[1], [0]]) + dist, ind = nn.radius_neighbors(None, radius=1.5) + check_object_arrays(dist, [[1], [1]]) + check_object_arrays(ind, [[1], [0]]) + + # Test the graph variants. + rng = nn.radius_neighbors_graph(None, radius=1.5) + kng = nn.kneighbors_graph(None) + for graph in [rng, kng]: + assert_array_equal(graph.A, [[0, 1], [1, 0]]) + assert_array_equal(graph.data, [1, 1]) + assert_array_equal(graph.indices, [1, 0]) + + X = [[0, 1], [0, 1], [1, 1]] + nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) + nn.fit(X) + assert_array_equal( + nn.kneighbors_graph().A, + np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), + ) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - X = [[0], [1]] - nn.fit(X) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_duplicates(algorithm): + # Test behavior of kneighbors when duplicates are present in query + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + duplicates = [[0], [1], [3]] - dist, ind = nn.kneighbors() - assert_array_equal(dist, [[1], [1]]) - assert_array_equal(ind, [[1], [0]]) - dist, ind = nn.radius_neighbors(None, radius=1.5) - check_object_arrays(dist, [[1], [1]]) - check_object_arrays(ind, [[1], [0]]) + nn.fit(duplicates) - # Test the graph variants. - rng = nn.radius_neighbors_graph(None, radius=1.5) - kng = nn.kneighbors_graph(None) - for graph in [rng, kng]: - assert_array_equal(graph.A, [[0, 1], [1, 0]]) - assert_array_equal(graph.data, [1, 1]) - assert_array_equal(graph.indices, [1, 0]) - - X = [[0, 1], [0, 1], [1, 1]] - nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) - nn.fit(X) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), - ) + # Do not do anything special to duplicates. + kng = nn.kneighbors_graph(duplicates, mode="distance") + assert_allclose( + kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + ) + assert_allclose(kng.data, [0.0, 0.0, 0.0]) + assert_allclose(kng.indices, [0, 1, 2]) + dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) + check_object_arrays(dist, [[0, 1], [1, 0]]) + check_object_arrays(ind, [[0, 1], [0, 1]]) -def test_k_and_radius_neighbors_duplicates(): - # Test behavior of kneighbors when duplicates are present in query + rng = nn.radius_neighbors_graph(duplicates, radius=1.5) + assert_allclose( + rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + ) - for algorithm in ALGORITHMS: - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - nn.fit([[0], [1]]) - - # Do not do anything special to duplicates. 
- kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) - assert_array_equal(kng.data, [0.0, 0.0]) - assert_array_equal(kng.indices, [0, 1]) - - dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) - check_object_arrays(dist, [[0, 1], [1, 0]]) - check_object_arrays(ind, [[0, 1], [0, 1]]) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_array_equal(rng.A, np.ones((2, 2))) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") - rng.sort_indices() - assert_array_equal(rng.A, [[0, 1], [1, 0]]) - assert_array_equal(rng.indices, [0, 1, 0, 1]) - assert_array_equal(rng.data, [0, 1, 1, 0]) - - # Mask the first duplicates when n_duplicates > n_neighbors. - X = np.ones((3, 1)) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") - nn.fit(X) - dist, ind = nn.kneighbors() - assert_array_equal(dist, np.zeros((3, 1))) - assert_array_equal(ind, [[1], [0], [1]]) - - # Test that zeros are explicitly marked in kneighbors_graph. - kng = nn.kneighbors_graph(mode="distance") - assert_array_equal(kng.A, np.zeros((3, 3))) - assert_array_equal(kng.data, np.zeros(3)) - assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), - ) + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") + rng.sort_indices() + assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]]) + assert_allclose(rng.indices, [0, 1, 0, 1]) + assert_allclose(rng.data, [0, 1, 1, 0]) + + # Mask the first duplicates when n_duplicates > n_neighbors. + X = np.ones((3, 1)) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") + nn.fit(X) + dist, ind = nn.kneighbors() + assert_allclose(dist, np.zeros((3, 1))) + assert_allclose(ind, [[1], [0], [1]]) + + # Test that zeros are explicitly marked in kneighbors_graph. 
+ kng = nn.kneighbors_graph(mode="distance") + assert_allclose(kng.toarray(), np.zeros((3, 3))) + assert_allclose(kng.data, np.zeros(3)) + assert_allclose(kng.indices, [1, 0, 1]) + assert_allclose( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + ) def test_include_self_neighbors_graph(): diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index de34b4d230171..e043ffb730708 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from sklearn.neighbors import DistanceMetric +from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( BallTree, kernel_norm, diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8290318d35deb..d87b5da52339c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,6 +3,7 @@ """ import pkgutil import inspect +from distutils.version import LooseVersion from importlib import import_module from operator import itemgetter from collections.abc import Sequence @@ -19,6 +20,7 @@ import warnings import numpy as np from scipy.sparse import issparse +from threadpoolctl import threadpool_info from .murmurhash import murmurhash3_32 from .class_weight import compute_class_weight, compute_sample_weight @@ -80,6 +82,39 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 +def _in_unstable_openblas_configuration(): + """Return True if in an unstable configuration for OpenBLAS""" + + # Import libraries which might load OpenBLAS. + import numpy # noqa + import scipy # noqa + + modules_info = threadpool_info() + + open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) + if not open_blas_used: + return False + + # OpenBLAS 0.3.16 fixed unstability for arm64, see: + # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa + openblas_arm64_stable_version = LooseVersion("0.3.16") + for info in modules_info: + if info["internal_api"] != "openblas": + continue + openblas_version = info.get("version") + openblas_architecture = info.get("architecture") + if openblas_version is None or openblas_architecture is None: + # Cannot be sure that OpenBLAS is good enough. Assume unstable: + return True + if ( + openblas_architecture == "neoversen1" + and openblas_version < openblas_arm64_stable_version + ): + # See discussions in https://github.com/numpy/numpy/issues/19411 + return True + return False + + class Bunch(dict): """Container object exposing keys as attributes. diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd new file mode 100644 index 0000000000000..0b65a5a32e393 --- /dev/null +++ b/sklearn/utils/_heap.pxd @@ -0,0 +1,19 @@ +# Heap routines, used in various Cython implementation. 
+ +from cython cimport floating + +from ._typedefs cimport ITYPE_t + +cdef int simultaneous_sort( + floating* dist, + ITYPE_t* idx, + ITYPE_t size +) nogil + +cdef int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx new file mode 100644 index 0000000000000..d6133eab7c658 --- /dev/null +++ b/sklearn/utils/_heap.pyx @@ -0,0 +1,144 @@ +#!python +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False + + +from cython cimport floating, integral, numeric + +from ._typedefs cimport ITYPE_t + +cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, + ITYPE_t i1, ITYPE_t i2) nogil: + """Swap the values at inex i1 and i2 of both darr and iarr""" + cdef floating dtmp = darr[i1] + darr[i1] = darr[i2] + darr[i2] = dtmp + + cdef ITYPE_t itmp = iarr[i1] + iarr[i1] = iarr[i2] + iarr[i2] = itmp + +cdef int simultaneous_sort( + floating* values, + ITYPE_t* indices, + ITYPE_t size +) nogil: + """ + Perform a recursive quicksort on the values array, simultaneously + performing the same swaps on the indices array. + """ + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using a std::sort and a Comparator which might need + # AoS instead of SoA (currently used). + cdef: + ITYPE_t pivot_idx, i, store_idx + floating pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + elif size == 3: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + if values[1] > values[2]: + dual_swap(values, indices, 1, 2) + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size // 2 + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + if values[size - 1] > values[pivot_idx]: + dual_swap(values, indices, size - 1, pivot_idx) + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + pivot_val = values[size - 1] + + # partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if values[i] < pivot_val: + dual_swap(values, indices, i, store_idx) + store_idx += 1 + dual_swap(values, indices, store_idx, size - 1) + pivot_idx = store_idx + + # recursively sort each side of the pivot + if pivot_idx > 1: + simultaneous_sort(values, indices, pivot_idx) + if pivot_idx + 2 < size: + simultaneous_sort(values + pivot_idx + 1, + indices + pivot_idx + 1, + size - pivot_idx - 1) + return 0 + + +cdef inline int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil: + """Push a tuple (val, val_idx) into a fixed-size max-heap. + + The max-heap is represented as a struct of arrays where: + - values is the array containing the data to construct the heap on + - indices is the array containing the indices (meta-data) of each value. 
+ """ + cdef: + ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx + + # check if val should be in heap + if val >= values[0]: + return 0 + + # insert val at position zero + values[0] = val + indices[0] = val_idx + + # descend the heap, swapping values until the max heap criterion is met + current_idx = 0 + while True: + left_child_idx = 2 * current_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= size: + break + elif right_child_idx >= size: + if values[left_child_idx] > val: + swap_idx = left_child_idx + else: + break + elif values[left_child_idx] >= values[right_child_idx]: + if val < values[left_child_idx]: + swap_idx = left_child_idx + else: + break + else: + if val < values[right_child_idx]: + swap_idx = right_child_idx + else: + break + + values[current_idx] = values[swap_idx] + indices[current_idx] = indices[swap_idx] + + current_idx = swap_idx + + values[current_idx] = val + indices[current_idx] = val_idx + + return 0 diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd new file mode 100644 index 0000000000000..e57fc9bfa6bf5 --- /dev/null +++ b/sklearn/utils/_openmp_helpers.pxd @@ -0,0 +1,6 @@ +# Helpers to access OpenMP threads information +# +# Those interfaces act as indirections which allows the non-support of OpenMP +# for implementations which have been written for it. + +cdef int _openmp_thread_num() nogil diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx index fb8920074a84e..cddd77ac42746 100644 --- a/sklearn/utils/_openmp_helpers.pyx +++ b/sklearn/utils/_openmp_helpers.pyx @@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: def _openmp_parallelism_enabled(): """Determines whether scikit-learn has been built with OpenMP - + It allows to retrieve at runtime the information gathered at compile time. """ # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during @@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None): - if the ``OMP_NUM_THREADS`` environment variable is set, return ``openmp.omp_get_max_threads()`` - otherwise, return the minimum between ``openmp.omp_get_max_threads()`` - and the number of cpus, taking cgroups quotas into account. Cgroups + and the number of cpus, taking cgroups quotas into account. Cgroups quotas can typically be set by tools such as Docker. The result of ``omp_get_max_threads`` can be influenced by environment variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``. @@ -59,4 +59,13 @@ cpdef _openmp_effective_n_threads(n_threads=None): # OpenMP disabled at build-time => sequential mode return 1 - + +cdef inline int _openmp_thread_num() nogil: + """Return the number of the thread calling this function. + + If scikit-learn is built without OpenMP support, always return 0. 
+ """ + IF SKLEARN_OPENMP_PARALLELISM_ENABLED: + return openmp.omp_get_thread_num() + ELSE: + return 0 diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1e4ecdd53e136..18f45d2680b13 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,7 +48,12 @@ import joblib import sklearn -from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated +from sklearn.utils import ( + IS_PYPY, + _IS_32BIT, + deprecated, + _in_unstable_openblas_configuration, +) from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -448,6 +453,10 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") + fails_if_unstable_openblas = pytest.mark.xfail( + _in_unstable_openblas_configuration(), + reason="OpenBLAS is unstable for this configuration", + ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) @@ -1041,3 +1050,24 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) + + +def get_dummy_metric_kwargs(metric: str, n_features: int): + """Return dummy DistanceMetric kwargs for tests.""" + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + kwargs = { + "minkowski": dict(p=1.5), + "seuclidean": dict(V=weights), + "wminkowski": dict(p=1.5, w=weights), + "mahalanobis": dict(VI=VI), + } + + return kwargs.get(metric, {}) diff --git a/sklearn/neighbors/_typedefs.pxd b/sklearn/utils/_typedefs.pxd similarity index 100% rename from sklearn/neighbors/_typedefs.pxd rename to sklearn/utils/_typedefs.pxd diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/utils/_typedefs.pyx similarity index 100% rename from sklearn/neighbors/_typedefs.pyx rename to sklearn/utils/_typedefs.pyx diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index c75cbe2d86495..6f65a7224d38b 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -85,6 +85,21 @@ def configuration(parent_package="", top_path=None): config.add_extension( "_readonly_array_wrapper", sources=["_readonly_array_wrapper.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + + config.add_extension( + "_typedefs", + sources=["_typedefs.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + + config.add_extension( + "_heap", + sources=["_heap.pyx"], + include_dirs=[numpy.get_include()], libraries=libraries, )