From de166e03100a33d7b0da900a88eacabbbc47e1e6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 10:15:33 +0100 Subject: [PATCH 01/22] MAINT Introduce Pairwise Distances Reductions private submodule This introduces the neccessary private implementations for a new private submodule, i.e.: - DatasetsPair, an abstraction to wrap a pair of two datasets and compute their vectors pairwise distances - DenseDenseDatasetsPair, a first implementation of DatasetsPair for pair of two dense datasets - PairwiseDistancesReduction, an abstraction allowing computing reductions efficiently in parallel and of - PairwiseDistancesArgkmin, a first implementation of PairwiseDistancesReduction for k-Nearest Neighbors search --- sklearn/metrics/_dist_metrics.pxd | 21 + sklearn/metrics/_dist_metrics.pyx | 188 +++- .../metrics/_pairwise_distances_reduction.pyx | 821 ++++++++++++++++++ sklearn/metrics/setup.py | 6 + .../test_pairwise_distances_reduction.py | 379 ++++++++ sklearn/utils/__init__.py | 35 +- sklearn/utils/_openmp_helpers.pxd | 6 + sklearn/utils/_openmp_helpers.pyx | 15 +- sklearn/utils/_testing.py | 11 +- 9 files changed, 1468 insertions(+), 14 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction.pyx create mode 100644 sklearn/metrics/tests/test_pairwise_distances_reduction.py create mode 100644 sklearn/utils/_openmp_helpers.pxd diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 611f6759e2c8b..e7c2f2ea2f926 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -64,3 +64,24 @@ cdef class DistanceMetric: cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1 cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 + + +###################################################################### +# DatasetsPair base class +cdef class DatasetsPair: + cdef DistanceMetric distance_metric + + cdef ITYPE_t n_samples_X(self) nogil + + cdef ITYPE_t n_samples_Y(self) nogil + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil + + +cdef class DenseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + ITYPE_t d diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index f7d22c1badfa2..3def08da7965c 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -4,6 +4,8 @@ import numpy as np cimport numpy as np +from cython cimport final + np.import_array() # required in order to use C-API @@ -23,10 +25,10 @@ cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n): return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -# some handy constants from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf +from scipy.sparse import csr_matrix, issparse from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE from ..utils._typedefs import DTYPE, ITYPE from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper @@ -67,6 +69,16 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance, 'haversine': HaversineDistance, 'pyfunc': PyFuncDistance} +BOOL_METRICS = [ + "matching", + "jaccard", + "dice", + "kulsinski", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] def get_valid_metric_ids(L): """Given an iterable of metric class names or class identifiers, @@ -195,8 +207,8 @@ cdef class DistanceMetric: """ def __cinit__(self): self.p = 2 - self.vec = 
np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -306,8 +318,9 @@ cdef class DistanceMetric: This can optionally be overridden in a base class. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. """ return self.dist(x1, x2, size) @@ -343,8 +356,9 @@ cdef class DistanceMetric: """Convert the rank-preserving surrogate distance to the distance. The surrogate distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. Parameters ---------- @@ -362,8 +376,9 @@ cdef class DistanceMetric: """Convert the true distance to the rank-preserving surrogate distance. The surrogate distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. Parameters ---------- @@ -1150,3 +1165,158 @@ cdef class PyFuncDistance(DistanceMetric): cdef inline double fmax(double a, double b) nogil: return max(a, b) + + +###################################################################### +# Datasets Pair Classes +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between a single pair of rows of + of X and Y at a time given the pair of their indices (i, j). This class is + specialized for each metric thanks to the :func:`get_for` factory classmethod. + + The handling of parallelization over chunks to compute the distances + and aggregation for several rows at a time is done in dedicated + subclasses of PairwiseDistancesReduction that in-turn rely on + subclasses of DatasetsPair for each pair of rows in the data. The goal + is to make it possible to decouple the generic parallelization and + aggregation logic from metric-specific computation as much as + possible. + + X and Y can be stored as np.ndarrays or CSR matrices in subclasses. + + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of dispatching method calls. + + Parameters + ---------- + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + str metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair: + """Return the DatasetsPair implementation for the given arguments. 
+ + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str, default='euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair + The suited DatasetsPair implementation. + """ + cdef: + DistanceMetric distance_metric = DistanceMetric.get_metric( + metric, + **(metric_kwargs or {}) + ) + + if X.dtype != np.float64 or Y.dtype != np.float64: + raise ValueError("Only 64bit float datasets are supported for X and Y.") + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + if issparse(X) or issparse(Y): + raise ValueError("Only dense datasets are supported for X and Y.") + + return DenseDenseDatasetsPair(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=ITYPE) + X_indptr = np.asarray(X.indptr, dtype=ITYPE) + return X_data, X_indptr, X_indptr + + def __init__(self, DistanceMetric distance_metric): + self.distance_metric = distance_metric + + cdef ITYPE_t n_samples_X(self) nogil: + """Number of samples in X.""" + return -999 + + cdef ITYPE_t n_samples_Y(self) nogil: + """Number of samples in Y.""" + return -999 + + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.dist(i, j) + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return -1 + +@final +cdef class DenseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Arrays have already been checked + self.X = X + self.Y = Y + self.d = X.shape[1] + + @final + cdef ITYPE_t n_samples_X(self) nogil: + return self.X.shape[0] + + @final + cdef ITYPE_t n_samples_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist(&self.X[i, 0], + &self.Y[j, 0], + self.d) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist(&self.X[i, 0], + &self.Y[j, 0], + self.d) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx new file mode 100644 index 0000000000000..d08b81d48a58c --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -0,0 +1,821 @@ +# Pairwise Distances Reductions +# ============================= +# +# Author: Julien Jerphanion +# +# +# The routines defined here are used in various algorithms performing +# the same structure of operations on distances between vectors +# of a datasets pair (X, Y). +# +# Importantly, the core of the computation is chunked to make sure that the pairwise +# distance chunk matrices stay in CPU cache before applying the final reduction step. +# Furthermore, the chunking strategy is also used to leverage OpenMP-based parallelism +# (using Cython prange loops) which gives another multiplicative speed-up in +# favorable cases on many-core machines. +cimport numpy as np +import numpy as np +import warnings +import scipy.sparse + +from .. import get_config +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from cython cimport final +from cython.parallel cimport parallel, prange + +from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair +from ..utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + ColMajor, + NoTrans, + RowMajor, + Trans, + _dot, + _gemm, +) +from ..utils._heap cimport simultaneous_sort, heap_push +from ..utils._openmp_helpers cimport _openmp_thread_num +from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t + +from numbers import Integral, Real +from typing import List +from scipy.sparse import issparse +from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING +from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils.fixes import threadpool_limits +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._typedefs import ITYPE, DTYPE + +np.import_array() + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. 
+ DTYPE_t * X_ptr = &X[0, 0] + ITYPE_t idx = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return row_norms + +cdef class PairwiseDistancesReduction: + """Abstract base class for pairwise distance computation & reduction + + Subclasses of this class compute pairwise distances between a set of + vectors (rows) X and another set of vectors (rows) Y and apply a + reduction on top. The reduction takes a matrix of pairwise distances + between rows of X and Y as input and outputs an aggregate data-structure + for each row of X. The aggregate values are typically smaller than the number + of rows in Y, hence the term reduction. + + For computational reasons, it is interesting to perform the reduction on + the fly on chunks of rows of X and Y so as to keep intermediate + data-structures in CPU cache and avoid unnecessary round trips of large + distance arrays with the RAM that would otherwise severely degrade the + speed by making the overall processing memory-bound. + + The base class provides the generic chunked parallelization template using + OpenMP loops (Cython prange), either on rows of X or rows of Y depending on + their respective sizes. + + The subclasses are specialized for reduction. + + The actual distance computation for a given pair of rows of X and Y are + delegated to format-specific subclasses of the DatasetsPair companion base + class. + + Parameters + ---------- + datasets_pair: DatasetsPair + The pair of dataset to use. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. + + See _openmp_effective_n_threads, for details about + the specification of n_threads. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + """ + + cdef: + readonly DatasetsPair datasets_pair + + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelisation strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, less threads might be needed to loop over pair of chunks. 
+ # + # Hence the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. + # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + + ITYPE_t n_samples_chunk, chunk_size + + ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder + ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + + bint execute_in_parallel_on_Y + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + "pyfunc", # is relatively slow because we need to coerce data as np arrays + "mahalanobis", # is numerically unstable + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using std::stable_sort and a Comparator taking an + # Arrays of Structures instead of Structure of Arrays (currently used). + "hamming", + *BOOL_METRICS, + } + return sorted(set(METRIC_MAPPING.keys()).difference(excluded)) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the PairwiseDistancesReduction can be used for the given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the PairwiseDistancesReduction can be used, else False. + """ + # Coercing to np.array to get the dtype + # TODO: what is the best way to get lists' dtype? 
+ X = np.asarray(X) if not isinstance(X, (np.ndarray, scipy.sparse.spmatrix)) else X + Y = np.asarray(Y) if not isinstance(Y, (np.ndarray, scipy.sparse.spmatrix)) else Y + # TODO: support sparse arrays and 32 bits + return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and + not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and + metric in cls.valid_metrics()) + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + strategy=None, + ): + cdef: + ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", 256) + + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) + + self.effective_n_threads = _openmp_effective_n_threads(n_threads) + + self.datasets_pair = datasets_pair + + self.n_samples_X = datasets_pair.n_samples_X() + self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) + X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk + self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + + self.n_samples_Y = datasets_pair.n_samples_Y() + self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) + Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk + self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + + # Counting remainder chunk in total number of chunks + self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) + self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + + # Not using less, not using more. + self.chunks_n_threads = min( + self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, + self.effective_n_threads, + ) + + @final + cdef void _parallel_on_X(self) nogil: + """Compute the pairwise distances of each vector (row) of X on Y + by parallelizing computation on chunks of X and reduce them. + + This strategy dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = _openmp_thread_num() + + # Allocating thread datastructures + self._parallel_on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if (X_chunk_idx == self.X_n_chunks - 1 + and self.X_n_samples_remainder > 0): + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + self._parallel_on_X_init_chunk(thread_num, X_start) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if (Y_chunk_idx == self.Y_n_chunks - 1 + and self.Y_n_samples_remainder > 0): + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._parallel_on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) nogil: + """Compute the pairwise distances of each vector (row) of X on Y + by parallelizing computation on chunks of Y and reduce them. + + This strategy dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. + """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t thread_num + + # Allocating datastructures + self._parallel_on_Y_parallel_init() + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = _openmp_thread_num() + + # Initializing datastructures used in this thread + self._parallel_on_Y_init(thread_num) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 \ + and self.Y_n_samples_remainder > 0: + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + + # Note: we don't need a _parallel_on_Y_finalize similarly. + # This can be introduced if needed. 
+ + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._parallel_on_Y_synchronize(X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._parallel_on_Y_finalize() + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is the core critical region of PairwiseDistanceReductions' computations + which must be implemented in subclasses. + """ + return + + def _finalize_results(self, bint return_distance): + """Call-back adapting datastructures before returning results. + + This must be implemented in subclasses. + """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) nogil: + """Convert rank-preserving distances to exact distances or recompute them.""" + return + + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _parallel_on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _parallel_on_Y_parallel_init( + self, + ) nogil: + """Allocate datastructures used in all threads.""" + return + + cdef void _parallel_on_Y_init( + self, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Update thread datastructures before leaving a parallel region.""" + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + """Update datastructures after executing all the reductions.""" + return + +cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): + """Compute the argkmin of vectors (rows) of X on the ones of Y. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + k: int + The k for the argkmin reduction. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~ArgKmin.compute`. + + See _openmp_effective_n_threads, for details about + the specification of n_threads. + """ + + cdef: + ITYPE_t k + + ITYPE_t[:, ::1] argkmin_indices + DTYPE_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. 
+        DTYPE_t ** heaps_r_distances_chunks
+        ITYPE_t ** heaps_indices_chunks
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        ITYPE_t k,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        n_threads=None,
+        str strategy=None,
+        bint return_distance=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        k : int
+            The k for the argkmin reduction.
+
+        metric : str, default='euclidean'
+            The distance metric to use for argkmin.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        n_threads : int, default=None
+            The number of OpenMP threads to use for the reduction.
+            Parallelism is done on chunks and the sharding of chunks
+            depends on the `strategy` set on
+            :method:`~PairwiseDistancesArgKmin.compute`.
+
+            See _openmp_effective_n_threads, for details about
+            the specification of n_threads.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset the parallelization is made on.
+
+            Strategies differ on the dispatching they use for chunks on threads:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread then iterates on all the chunks of X. This strategy is
+              embarrassingly parallel but uses intermediate datastructures
+              synchronisation.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y'.
+
+              - None (default) looks-up in scikit-learn configuration for
+                `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+        Indices of argkmin for each vector in X and its associated distances
+        if return_distance=True.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the argument
+        values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute`
+        instance method of the most appropriate :class:`PairwiseDistancesArgKmin`
+        concrete implementation.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the interface entirely from the
+        implementation details whilst maintaining RAII.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory handles the specialisations for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various back-end and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+ + pda = PairwiseDistancesArgKmin( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + def __init__( + self, + DatasetsPair datasets_pair, + ITYPE_t k, + chunk_size=None, + n_threads=None, + strategy=None, + ): + super().__init__(datasets_pair, chunk_size, n_threads, strategy) + + self.k = check_scalar(k, "k", Integral, min_val=1) + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, those heaps pointers are referencing + # (with proper offsets) addresses of the two main heaps (see bellow) + # - when parallelizing on Y, those heaps pointer heaps are referencing + # small heaps which are thread-wise-allocated and whose content will be + # merged with the main heaps'. + self.heaps_r_distances_chunks = malloc( + sizeof(DTYPE_t *) * self.chunks_n_threads + ) + self.heaps_indices_chunks = malloc( + sizeof(ITYPE_t *) * self.chunks_n_threads + ) + + # Main heaps used by PairwiseDistancesArgKmin._compute to return results. + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_samples_X = X_end - X_start + ITYPE_t n_samples_Y = Y_end - Y_start + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(n_samples_X): + for j in range(n_samples_Y): + heap_push( + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + self.k, + self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), + Y_start + j, + ) + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ) nogil: + # As this strategy is embarrassingly parallel, we can set each + # thread's heaps pointer to the proper position on the main heaps. 
self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0]
+        self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0]
+
+    @final
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx
+
+        # Sorting indices of the argkmin for each query vector of X
+        for idx in range(X_end - X_start):
+            simultaneous_sort(
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.k
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+    ) nogil:
+        cdef:
+            # Maximum number of scalar elements (the last chunks can be smaller)
+            ITYPE_t heaps_size = self.X_n_samples_chunk * self.k
+            ITYPE_t thread_num
+
+        # The allocation is done in parallel for data locality purposes: this way
+        # the heaps used in each thread are allocated in pages which are closer
+        # to the processor core used by that thread.
+        for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True,
+                                 num_threads=self.chunks_n_threads):
+            # Each thread processes all the chunks of X, so the main heaps would
+            # be shared across threads. To avoid this, each thread uses its own
+            # heaps which are then synchronised back into the main ones.
+            self.heaps_r_distances_chunks[thread_num] = <DTYPE_t *> malloc(
+                heaps_size * sizeof(DTYPE_t)
+            )
+            self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(
+                heaps_size * sizeof(ITYPE_t)
+            )
+
+    @final
+    cdef void _parallel_on_Y_init(
+        self,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Initialising heaps (memset can't be used here)
+        for idx in range(self.X_n_samples_chunk * self.k):
+            self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX
+            self.heaps_indices_chunks[thread_num][idx] = -1
+
+    @final
+    cdef void _parallel_on_Y_synchronize(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx, thread_num
+        with nogil, parallel(num_threads=self.effective_n_threads):
+            # Synchronising the thread heaps with the main heaps.
+            # This is done in parallel sample-wise (no need for locks).
+            # This might break each thread's data locality a bit,
+            # but this is negligible and this parallel pattern has
+            # shown to be efficient in practice.
+ for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(self.chunks_n_threads): + for jdx in range(self.k): + heap_push( + &self.argkmin_distances[X_start + idx, 0], + &self.argkmin_indices[X_start + idx, 0], + self.k, + self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t idx, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sort the main heaps into arrays in parallel + # in ascending order w.r.t the distances + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) nogil: + cdef: + ITYPE_t i, j + ITYPE_t[:, ::1] Y_indices = self.argkmin_indices + DTYPE_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_samples_X, schedule='static', nogil=True, + num_threads=self.effective_n_threads): + for j in range(self.k): + distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + max(distances[i, j], 0.) + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 69925a3590be6..2bf582506922e 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -26,6 +26,12 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_pairwise_distances_reduction", + sources=["_pairwise_distances_reduction.pyx"], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py new file mode 100644 index 0000000000000..c6efeb8259a20 --- /dev/null +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -0,0 +1,379 @@ +import numpy as np +import pytest +from collections import defaultdict +from numpy.testing import assert_array_equal, assert_allclose +from scipy.sparse import csr_matrix + +from sklearn.metrics._pairwise_distances_reduction import ( + PairwiseDistancesReduction, + PairwiseDistancesArgKmin, + _sqeuclidean_row_norms, +) + +from sklearn.utils import _in_unstable_openblas_configuration +from sklearn.utils.fixes import sp_version, parse_version +from sklearn.utils._testing import fails_if_unstable_openblas + + +def _get_dummy_metric_params_list(metric: str, n_features: int): + """Return list of dummy DistanceMetric kwargs for tests.""" + + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + METRICS_PARAMS = defaultdict( + list, + { + "euclidean": [{}], + "manhattan": [{}], + "minkowski": 
[dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)], + "chebyshev": [{}], + "seuclidean": [dict(V=rng.rand(n_features))], + "haversine": [{}], + "wminkowski": [dict(p=1.5, w=weights)], + "mahalanobis": [dict(VI=VI)], + }, + ) + + wminkowski_kwargs = dict(p=3, w=rng.rand(n_features)) + + if sp_version < parse_version("1.8.0.dev0"): + # TODO: remove once we no longer support scipy < 1.8.0. + # wminkowski was removed in scipy 1.8.0 but should work for previous + # versions. + METRICS_PARAMS["wminkowski"].append(wminkowski_kwargs) # type: ignore + else: + # Recent scipy versions accept weights in the Minkowski metric directly: + # type: ignore + METRICS_PARAMS["minkowski"].append(wminkowski_kwargs) # type: ignore + + return METRICS_PARAMS.get(metric, [{}]) + + +def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): + assert_array_equal( + ref_indices, + indices, + err_msg="Query vectors have different neighbors' indices", + ) + assert_allclose( + ref_dist, + dist, + err_msg="Query vectors have different neighbors' distances", + rtol=1e-7, + ) + + +ASSERT_RESULT = { + PairwiseDistancesArgKmin: assert_argkmin_results_equality, +} + + +def test_pairwise_distances_reduction_is_usable_for(): + rng = np.random.RandomState(0) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + assert not PairwiseDistancesReduction.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) + + assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") + # TODO: remove once 32 bits datasets are supported + assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric) + + # TODO: remove once sparse matrices are supported + assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesArgKmin.compute( + X=X.astype(np.float32), Y=Y, k=k, metric=metric + ) + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." 
+ ): + PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=0, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesArgKmin.compute( + X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesArgKmin.compute( + X=np.asfortranarray(X), Y=Y, k=k, metric=metric + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_chunk_size_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + n_features=100, + dtype=np.float64, +): + # Results should not depend on the chunk size + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + chunk_size=chunk_size, + return_distance=True, + ) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_n_threads_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + n_features=100, + dtype=np.float64, +): + # Results should not depend on the number of threads + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X, Y, parameter, n_threads=1, return_distance=True + ) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_strategies_consistency( + PairwiseDistancesReduction, + metric, + n_samples, + seed, + n_features=10, + dtype=np.float64, 
+): + # Results obtained using both parallelization strategies must be identical + if _in_unstable_openblas_configuration() and metric in ("sqeuclidean", "euclidean"): + pytest.xfail( + "OpenBLAS (used for '(sq)euclidean') is unstable in this configuration" + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + dist_par_X, indices_par_X = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + # To be sure to use parallelization + chunk_size=n_samples // 4, + strategy="parallel_on_X", + return_distance=True, + ) + + dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + # To be sure to use parallelization + chunk_size=n_samples // 4, + strategy="parallel_on_Y", + return_distance=True, + ) + + ASSERT_RESULT[PairwiseDistancesReduction]( + dist_par_X, + dist_par_Y, + indices_par_X, + indices_par_Y, + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("n_features", [50, 500]) +@pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_euclidean_translation_invariance( + n_features, + translation, + metric, + PairwiseDistancesReduction, + n_samples=1000, + dtype=np.float64, +): + # The reduction must be translation invariant. 
+ parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + rng = np.random.RandomState(0) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + reference_dist, reference_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X + 0, + Y + 0, + parameter, + metric=metric, + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + return_distance=True, + ) + + ASSERT_RESULT[PairwiseDistancesReduction]( + reference_dist, dist, reference_indices, indices + ) + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("num_threads", [1, 2, 8]) +def test_sqeuclidean_row_norms( + seed, + n_samples, + n_features, + num_threads, + dtype=np.float64, +): + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + + assert_allclose(sq_row_norm_reference, sq_row_norm) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3d8a1ca87d210..4b2261ad7c2f4 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,7 +26,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .fixes import np_version, parse_version +from .fixes import np_version, parse_version, threadpool_info from ._estimator_html_repr import estimator_html_repr from .validation import ( as_float_array, @@ -81,6 +81,39 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 +def _in_unstable_openblas_configuration(): + """Return True if in an unstable configuration for OpenBLAS""" + + # Import libraries which might load OpenBLAS. + import numpy # noqa + import scipy # noqa + + modules_info = threadpool_info() + + open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) + if not open_blas_used: + return False + + # OpenBLAS 0.3.16 fixed unstability for arm64, see: + # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa + openblas_arm64_stable_version = parse_version("0.3.16") + for info in modules_info: + if info["internal_api"] != "openblas": + continue + openblas_version = info.get("version") + openblas_architecture = info.get("architecture") + if openblas_version is None or openblas_architecture is None: + # Cannot be sure that OpenBLAS is good enough. Assume unstable: + return True + if ( + openblas_architecture == "neoversen1" + and parse_version(openblas_version) < openblas_arm64_stable_version + ): + # See discussions in https://github.com/numpy/numpy/issues/19411 + return True + return False + + class Bunch(dict): """Container object exposing keys as attributes. 
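[Editor's note] As an aside on the helper introduced just above, here is a minimal sketch of the OpenBLAS version check it performs. This is an illustration only, not part of the patch; the `threadpool_info()`-style entry below is a made-up example that carries the `internal_api`, `version` and `architecture` keys the helper reads, and the sketch only mirrors the arm64 branch of the check (a missing version or architecture is treated as unstable by the real helper).

    from sklearn.utils.fixes import parse_version

    # Hypothetical threadpoolctl entry for an OpenBLAS build on an arm64 (neoversen1) core.
    info = {"internal_api": "openblas", "version": "0.3.13", "architecture": "neoversen1"}

    # OpenBLAS older than 0.3.16 on neoversen1 is flagged as unstable.
    unstable = (
        info["internal_api"] == "openblas"
        and info.get("version") is not None
        and info.get("architecture") == "neoversen1"
        and parse_version(info["version"]) < parse_version("0.3.16")
    )
    print(unstable)  # True

The test suite consumes this predicate through the `fails_if_unstable_openblas` xfail marker added below in `sklearn/utils/_testing.py`.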
diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd new file mode 100644 index 0000000000000..e57fc9bfa6bf5 --- /dev/null +++ b/sklearn/utils/_openmp_helpers.pxd @@ -0,0 +1,6 @@ +# Helpers to access OpenMP threads information +# +# Those interfaces act as indirections which allows the non-support of OpenMP +# for implementations which have been written for it. + +cdef int _openmp_thread_num() nogil diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx index fb8920074a84e..cddd77ac42746 100644 --- a/sklearn/utils/_openmp_helpers.pyx +++ b/sklearn/utils/_openmp_helpers.pyx @@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: def _openmp_parallelism_enabled(): """Determines whether scikit-learn has been built with OpenMP - + It allows to retrieve at runtime the information gathered at compile time. """ # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during @@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None): - if the ``OMP_NUM_THREADS`` environment variable is set, return ``openmp.omp_get_max_threads()`` - otherwise, return the minimum between ``openmp.omp_get_max_threads()`` - and the number of cpus, taking cgroups quotas into account. Cgroups + and the number of cpus, taking cgroups quotas into account. Cgroups quotas can typically be set by tools such as Docker. The result of ``omp_get_max_threads`` can be influenced by environment variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``. @@ -59,4 +59,13 @@ cpdef _openmp_effective_n_threads(n_threads=None): # OpenMP disabled at build-time => sequential mode return 1 - + +cdef inline int _openmp_thread_num() nogil: + """Return the number of the thread calling this function. + + If scikit-learn is built without OpenMP support, always return 0. + """ + IF SKLEARN_OPENMP_PARALLELISM_ENABLED: + return openmp.omp_get_thread_num() + ELSE: + return 0 diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1724063be2f43..6f58ce3f3b7b4 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,7 +48,12 @@ import joblib import sklearn -from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated +from sklearn.utils import ( + IS_PYPY, + _IS_32BIT, + deprecated, + _in_unstable_openblas_configuration, +) from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -448,6 +453,10 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") + fails_if_unstable_openblas = pytest.mark.xfail( + _in_unstable_openblas_configuration(), + reason="OpenBLAS is unstable for this configuration", + ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) From 14106c484e856c0e7466455d295c9229d952a5c8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 10:48:38 +0100 Subject: [PATCH 02/22] Retrigger CI for failing Circle CI job From 3cdd3a5ad099326b1bd2c8601d7360272c10f7e4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 11:15:45 +0100 Subject: [PATCH 03/22] TST Improve _get_dummy_metric_params_list Co-authored-by: Thomas J. 
Fan --- .../test_pairwise_distances_reduction.py | 71 ++++++++++--------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c6efeb8259a20..89d012ac148ee 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,6 +1,5 @@ import numpy as np import pytest -from collections import defaultdict from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import csr_matrix @@ -18,42 +17,44 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): """Return list of dummy DistanceMetric kwargs for tests.""" + # Distinguishing on cases not to compute unneeded datastructures. rng = np.random.RandomState(1) - weights = rng.random_sample(n_features) - weights /= weights.sum() - - V = rng.random_sample((n_features, n_features)) - - # VI is positive-semidefinite, preferred for precision matrix - VI = np.dot(V, V.T) + 3 * np.eye(n_features) - - METRICS_PARAMS = defaultdict( - list, - { - "euclidean": [{}], - "manhattan": [{}], - "minkowski": [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)], - "chebyshev": [{}], - "seuclidean": [dict(V=rng.rand(n_features))], - "haversine": [{}], - "wminkowski": [dict(p=1.5, w=weights)], - "mahalanobis": [dict(VI=VI)], - }, - ) - - wminkowski_kwargs = dict(p=3, w=rng.rand(n_features)) - - if sp_version < parse_version("1.8.0.dev0"): - # TODO: remove once we no longer support scipy < 1.8.0. - # wminkowski was removed in scipy 1.8.0 but should work for previous - # versions. - METRICS_PARAMS["wminkowski"].append(wminkowski_kwargs) # type: ignore - else: - # Recent scipy versions accept weights in the Minkowski metric directly: - # type: ignore - METRICS_PARAMS["minkowski"].append(wminkowski_kwargs) # type: ignore - return METRICS_PARAMS.get(metric, [{}]) + if metric == "minkowski": + minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)] + if sp_version >= parse_version("1.8.0.dev0"): + # TODO: remove the test once we no longer support scipy < 1.8.0. + # Recent scipy versions accept weights in the Minkowski metric directly: + # type: ignore + minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features))) + + return minkowski_kwargs + + # TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0. + if metric == "wminkowski": + weights = rng.random_sample(n_features) + weights /= weights.sum() + wminkowski_kwargs = [dict(p=1.5, w=weights)] + if sp_version < parse_version("1.8.0.dev0"): + # wminkowski was removed in scipy 1.8.0 but should work for previous + # versions. + wminkowski_kwargs.append(dict(p=3, w=rng.rand(n_features))) + return wminkowski_kwargs + + if metric == "seuclidean": + return [dict(V=rng.rand(n_features))] + + if metric == "mahalanobis": + V = rng.random_sample((n_features, n_features)) + # This makes VI is positive-semidefinite, which is a + # necessary condition to get nonsingular precision matrix. + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + return [dict(VI=VI)] + + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. + # In those cases, no kwargs is needed. 
+ return [{}] def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): From 31db785b792a43b0064fc51fbb43c7709c4310bb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 13:10:25 +0100 Subject: [PATCH 04/22] Address review comments Co-authored-by: Christian Lorentzen --- sklearn/metrics/_dist_metrics.pyx | 26 ++++++++++++++------------ sklearn/metrics/setup.py | 1 + 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 3def08da7965c..6d090bdebafe5 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1184,7 +1184,8 @@ cdef class DatasetsPair: aggregation logic from metric-specific computation as much as possible. - X and Y can be stored as np.ndarrays or CSR matrices in subclasses. + X and Y can be stored as C-contiguous np.ndarrays or CSR matrices + in subclasses. This class avoids the overhead of dispatching distance computations to :class:`sklearn.metrics.DistanceMetric` based on the physical @@ -1240,7 +1241,7 @@ cdef class DatasetsPair: **(metric_kwargs or {}) ) - if X.dtype != np.float64 or Y.dtype != np.float64: + if not(X.dtype == Y.dtype == np.float64): raise ValueError("Only 64bit float datasets are supported for X and Y.") # Metric-specific checks that do not replace nor duplicate `check_array`. @@ -1252,34 +1253,35 @@ cdef class DatasetsPair: return DenseDenseDatasetsPair(X, Y, distance_metric) - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure getting ITYPE instead of int internally used for CSR matrices.""" - X_data = np.asarray(X.data, dtype=DTYPE) - X_indices = np.asarray(X.indices, dtype=ITYPE) - X_indptr = np.asarray(X.indptr, dtype=ITYPE) - return X_data, X_indptr, X_indptr - def __init__(self, DistanceMetric distance_metric): self.distance_metric = distance_metric cdef ITYPE_t n_samples_X(self) nogil: """Number of samples in X.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 return -999 cdef ITYPE_t n_samples_Y(self) nogil: """Number of samples in Y.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 return -999 cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.dist(i, j) cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 return -1 @final cdef class DenseDenseDatasetsPair(DatasetsPair): - """Compute distances between vectors of two arrays. + """Compute distances between row vectors of two arrays. Parameters ---------- @@ -1291,7 +1293,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): distance_metric: DistanceMetric The distance metric responsible for computing distances - between two vectors of (X, Y). + between two row vectors of (X, Y). 
""" def __init__(self, X, Y, DistanceMetric distance_metric): diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 2bf582506922e..1c26d9969397c 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -29,6 +29,7 @@ def configuration(parent_package="", top_path=None): config.add_extension( "_pairwise_distances_reduction", sources=["_pairwise_distances_reduction.pyx"], + include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], libraries=libraries, ) From b60e8977dab871fad0c2054c43e9e52039c6a5c0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 17:33:25 +0100 Subject: [PATCH 05/22] Address review comments Co-authored-by: Christian Lorentzen --- .../metrics/_pairwise_distances_reduction.pyx | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index d08b81d48a58c..40f3973ad225a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -4,8 +4,8 @@ # Author: Julien Jerphanion # # -# The routines defined here are used in various algorithms performing -# the same structure of operations on distances between vectors +# The abstractions defined here are used in various algorithms performing +# the same structure of operations on distances between row vectors # of a datasets pair (X, Y). # # Importantly, the core of the computation is chunked to make sure that the pairwise @@ -37,7 +37,7 @@ from ..utils._cython_blas cimport ( ) from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num -from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t +from ..utils._typedefs cimport ITYPE_t, DTYPE_t from numbers import Integral, Real from typing import List @@ -48,8 +48,10 @@ from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE + np.import_array() + cpdef DTYPE_t[::1] _sqeuclidean_row_norms( const DTYPE_t[:, ::1] X, ITYPE_t num_threads, @@ -62,26 +64,27 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms( # Casting for X to remove the const qualifier is needed because APIs # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' # const qualifier. + # See: https://github.com/scipy/scipy/issues/1426 DTYPE_t * X_ptr = &X[0, 0] ITYPE_t idx = 0 ITYPE_t n = X.shape[0] ITYPE_t d = X.shape[1] - DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): - row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) - return row_norms + return squared_row_norms cdef class PairwiseDistancesReduction: - """Abstract base class for pairwise distance computation & reduction + """Abstract base class for pairwise distance computation & reduction. Subclasses of this class compute pairwise distances between a set of - vectors (rows) X and another set of vectors (rows) Y and apply a - reduction on top. The reduction takes a matrix of pairwise distances - between rows of X and Y as input and outputs an aggregate data-structure - for each row of X. The aggregate values are typically smaller than the number - of rows in Y, hence the term reduction. 
+ row vectors of X and another set of row vectors pf Y and apply a reduction on top. + The reduction takes a matrix of pairwise distances between rows of X and Y + as input and outputs an aggregate data-structure for each row of X. + The aggregate values are typically smaller than the number of rows in Y, + hence the term reduction. For computational reasons, it is interesting to perform the reduction on the fly on chunks of rows of X and Y so as to keep intermediate @@ -104,7 +107,7 @@ cdef class PairwiseDistancesReduction: datasets_pair: DatasetsPair The pair of dataset to use. - chunk_size: int, default=None, + chunk_size: int, default=None The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, and use 256 if it is not set. @@ -176,7 +179,7 @@ cdef class PairwiseDistancesReduction: "hamming", *BOOL_METRICS, } - return sorted(set(METRIC_MAPPING.keys()).difference(excluded)) + return sorted(set(METRIC_MAPPING.keys()) - excluded) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -266,7 +269,7 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_X(self) nogil: - """Compute the pairwise distances of each vector (row) of X on Y + """Compute the pairwise distances of each row vector of X on Y by parallelizing computation on chunks of X and reduce them. This strategy dispatches chunks of X uniformly on threads. @@ -326,7 +329,7 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_Y(self) nogil: - """Compute the pairwise distances of each vector (row) of X on Y + """Compute the pairwise distances of each row vector of X on Y by parallelizing computation on chunks of Y and reduce them. This strategy dispatches chunks of Y uniformly on threads. @@ -399,8 +402,8 @@ cdef class PairwiseDistancesReduction: ) nogil: """Compute the pairwise distances on two chunks of X and Y and reduce them. - This is the core critical region of PairwiseDistanceReductions' computations - which must be implemented in subclasses. + This is THE core computational method of PairwiseDistanceReductions. + This must be implemented in subclasses. """ return @@ -476,7 +479,7 @@ cdef class PairwiseDistancesReduction: return cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - """Compute the argkmin of vectors (rows) of X on the ones of Y. + """Compute the argkmin of row vectors of X on the ones of Y. Parameters ---------- From 5d7ea09a96008732d6250915344374c992e4a7c7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 17:07:10 +0100 Subject: [PATCH 06/22] DEBUG TST Try removing handling of unstable OpenBLAS configuration This was introduced once a long time ago for a failure which was happening in a single configuration (see the comments). Let's see if this has been fixed. 
Co-authored-by: Christian Lorentzen --- .../metrics/_pairwise_distances_reduction.pyx | 2 +- .../test_pairwise_distances_reduction.py | 10 ------ sklearn/utils/__init__.py | 35 +------------------ sklearn/utils/_testing.py | 11 +----- 4 files changed, 3 insertions(+), 55 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 40f3973ad225a..b2bee50174d1d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -43,7 +43,7 @@ from numbers import Integral, Real from typing import List from scipy.sparse import issparse from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils import check_scalar from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 89d012ac148ee..710ae9636494b 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -9,9 +9,7 @@ _sqeuclidean_row_norms, ) -from sklearn.utils import _in_unstable_openblas_configuration from sklearn.utils.fixes import sp_version, parse_version -from sklearn.utils._testing import fails_if_unstable_openblas def _get_dummy_metric_params_list(metric: str, n_features: int): @@ -140,7 +138,6 @@ def test_argkmin_factory_method_wrong_usages(): ) -@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @@ -187,7 +184,6 @@ def test_chunk_size_agnosticism( ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) -@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @@ -245,11 +241,6 @@ def test_strategies_consistency( n_features=10, dtype=np.float64, ): - # Results obtained using both parallelization strategies must be identical - if _in_unstable_openblas_configuration() and metric in ("sqeuclidean", "euclidean"): - pytest.xfail( - "OpenBLAS (used for '(sq)euclidean') is unstable in this configuration" - ) rng = np.random.RandomState(seed) spread = 100 @@ -302,7 +293,6 @@ def test_strategies_consistency( ) -@fails_if_unstable_openblas @pytest.mark.parametrize("n_features", [50, 500]) @pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4b2261ad7c2f4..3d8a1ca87d210 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,7 +26,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .fixes import np_version, parse_version, threadpool_info +from .fixes import np_version, parse_version from ._estimator_html_repr import estimator_html_repr from .validation import ( as_float_array, @@ -81,39 +81,6 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 -def _in_unstable_openblas_configuration(): - """Return True if in an unstable configuration for OpenBLAS""" - - # Import libraries which might load OpenBLAS. 
- import numpy # noqa - import scipy # noqa - - modules_info = threadpool_info() - - open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) - if not open_blas_used: - return False - - # OpenBLAS 0.3.16 fixed unstability for arm64, see: - # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa - openblas_arm64_stable_version = parse_version("0.3.16") - for info in modules_info: - if info["internal_api"] != "openblas": - continue - openblas_version = info.get("version") - openblas_architecture = info.get("architecture") - if openblas_version is None or openblas_architecture is None: - # Cannot be sure that OpenBLAS is good enough. Assume unstable: - return True - if ( - openblas_architecture == "neoversen1" - and parse_version(openblas_version) < openblas_arm64_stable_version - ): - # See discussions in https://github.com/numpy/numpy/issues/19411 - return True - return False - - class Bunch(dict): """Container object exposing keys as attributes. diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 6f58ce3f3b7b4..1724063be2f43 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,12 +48,7 @@ import joblib import sklearn -from sklearn.utils import ( - IS_PYPY, - _IS_32BIT, - deprecated, - _in_unstable_openblas_configuration, -) +from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -453,10 +448,6 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") - fails_if_unstable_openblas = pytest.mark.xfail( - _in_unstable_openblas_configuration(), - reason="OpenBLAS is unstable for this configuration", - ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) From fb927e74bc8ac328cbfde3cbf75f2985ca837569 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 18:28:42 +0100 Subject: [PATCH 07/22] TST Remove useless mahalanobis case --- .../metrics/tests/test_pairwise_distances_reduction.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 710ae9636494b..844f8e7138d20 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -42,14 +42,6 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): if metric == "seuclidean": return [dict(V=rng.rand(n_features))] - if metric == "mahalanobis": - V = rng.random_sample((n_features, n_features)) - # This makes VI is positive-semidefinite, which is a - # necessary condition to get nonsingular precision matrix. - VI = np.dot(V, V.T) + 3 * np.eye(n_features) - - return [dict(VI=VI)] - # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. # In those cases, no kwargs is needed. return [{}] From a2f7b6d11597388e95443325d692037ef6eb321c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 3 Jan 2022 10:00:36 +0100 Subject: [PATCH 08/22] Factor the logic for computing last chunks indices Co-authored-by: Christian Lorentzen Co-authored-by: Thomas J. 
Fan --- .../metrics/_pairwise_distances_reduction.pyx | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index b2bee50174d1d..f2d6715e81911 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -162,8 +162,8 @@ cdef class PairwiseDistancesReduction: ITYPE_t n_samples_chunk, chunk_size - ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder - ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk bint execute_in_parallel_on_Y @@ -233,16 +233,24 @@ cdef class PairwiseDistancesReduction: self.n_samples_X = datasets_pair.n_samples_X() self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk - self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0) + + if X_n_samples_remainder != 0: + self.X_n_samples_last_chunk = X_n_samples_remainder + else: + self.X_n_samples_last_chunk = self.X_n_samples_chunk self.n_samples_Y = datasets_pair.n_samples_Y() self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk - self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0) - # Counting remainder chunk in total number of chunks - self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) - self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) + if Y_n_samples_remainder != 0: + self.Y_n_samples_last_chunk = Y_n_samples_remainder + else: + self.Y_n_samples_last_chunk = self.Y_n_samples_chunk if strategy is None: strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') @@ -293,9 +301,8 @@ cdef class PairwiseDistancesReduction: for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): X_start = X_chunk_idx * self.X_n_samples_chunk - if (X_chunk_idx == self.X_n_chunks - 1 - and self.X_n_samples_remainder > 0): - X_end = X_start + self.X_n_samples_remainder + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk else: X_end = X_start + self.X_n_samples_chunk @@ -304,9 +311,8 @@ cdef class PairwiseDistancesReduction: for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if (Y_chunk_idx == self.Y_n_chunks - 1 - and self.Y_n_samples_remainder > 0): - Y_end = Y_start + self.Y_n_samples_remainder + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk else: Y_end = Y_start + self.Y_n_samples_chunk @@ -351,8 +357,8 @@ cdef class PairwiseDistancesReduction: for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: - X_end = X_start + self.X_n_samples_remainder + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk else: X_end = X_start + self.X_n_samples_chunk @@ -364,9 +370,8 @@ cdef class PairwiseDistancesReduction: 
for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1 \ - and self.Y_n_samples_remainder > 0: - Y_end = Y_start + self.Y_n_samples_remainder + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk else: Y_end = Y_start + self.Y_n_samples_chunk From e9acef73941b9d42ac999b17e73a6f9d1135c762 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 3 Jan 2022 15:55:27 +0100 Subject: [PATCH 09/22] Improve comments regarding strategies and parallel sections Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index f2d6715e81911..86dee15dd9cc7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -123,19 +123,27 @@ cdef class PairwiseDistancesReduction: strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None The chunking strategy defining which dataset parallelization are made on. - Strategies differs on the dispatching they use for chunks on threads: + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: - 'parallel_on_X' dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is embarrassingly parallel and comes with no datastructures synchronisation. - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y'. + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. @@ -278,7 +286,8 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_X(self) nogil: """Compute the pairwise distances of each row vector of X on Y - by parallelizing computation on chunks of X and reduce them. + by parallelizing computation on the outer loop on chunks of X + and reduce them. This strategy dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is @@ -336,7 +345,8 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_Y(self) nogil: """Compute the pairwise distances of each row vector of X on Y - by parallelizing computation on chunks of Y and reduce them. + by parallelizing computation on the inner loop on chunks of Y + and reduce them. 
This strategy dispatches chunks of Y uniformly on threads. Each thread then iterates on all the chunks of X. This strategy is @@ -352,7 +362,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx ITYPE_t thread_num - # Allocating datastructures + # Allocating datastructures shared by all threads self._parallel_on_Y_parallel_init() for X_chunk_idx in range(self.X_n_chunks): @@ -659,7 +669,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.chunks_n_threads ) - # Main heaps used by PairwiseDistancesArgKmin._compute to return results. + # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`. self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -685,8 +695,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - # Pushing the distance and their associated indices on heaps - # which keep tracks of the argkmin. + # Pushing the distances and their associated indices on a heap + # which by construction will keep track of the argkmin. for i in range(n_samples_X): for j in range(n_samples_Y): heap_push( @@ -718,7 +728,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): cdef: ITYPE_t idx, jdx - # Sorting indices of the argkmin for each query vector of X + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. for idx in range(X_end - X_start): simultaneous_sort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, @@ -736,7 +747,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # The allocation is done in parallel for data locality purposes: this way # the heaps used in each threads are allocated in pages which are closer - # to processor core used by the thread. + # to the CPU core used by the thread. + # See comments about First Touch Placement Policy: + # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, num_threads=self.chunks_n_threads): # As chunks of X are shared across threads, so must their @@ -770,9 +783,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. # This is done in parallel sample-wise (no need for locks). - # This might break each thread's data locality a bit but - # but this is negligible and this parallel pattern has - # shown to be efficient in practice. + # + # This might break each thread's data locality as each heap which + # was allocated in a thread is being now being used in several threads. + # + # Still, this parallel pattern has shown to be efficient in practice. for idx in prange(X_end - X_start, schedule="static"): for thread_num in range(self.chunks_n_threads): for jdx in range(self.k): @@ -796,8 +811,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): free(self.heaps_r_distances_chunks[thread_num]) free(self.heaps_indices_chunks[thread_num]) - # Sort the main heaps into arrays in parallel - # in ascending order w.r.t the distances + # Sorting the main in ascending order w.r.t the distances. 
+ # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): simultaneous_sort( &self.argkmin_distances[idx, 0], From 51dad2b2065d73fded8af7cb8d88265527564249 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 10:53:26 +0100 Subject: [PATCH 10/22] Address reviews' comments Co-authored-by: Thomas J. Fan Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx | 14 +++++---- .../metrics/_pairwise_distances_reduction.pyx | 30 +++++++++---------- .../test_pairwise_distances_reduction.py | 19 ++++++------ 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 6d090bdebafe5..6cf93baeca925 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1222,10 +1222,10 @@ cdef class DatasetsPair: If provided as a sparse matrix, it must be in CSR format. metric : str, default='euclidean' - The distance metric to use for argkmin. The default metric is - a fast implementation of the standard Euclidean metric. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. + The distance metric to compute between rows of X and Y. + The default metric is a fast implementation of the Euclidean + metric. For a list of available metrics, see the documentation + of :class:`~sklearn.metrics.DistanceMetric`. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. @@ -1242,12 +1242,16 @@ cdef class DatasetsPair: ) if not(X.dtype == Y.dtype == np.float64): - raise ValueError("Only 64bit float datasets are supported for X and Y.") + raise ValueError( + f"Only 64bit float datasets are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}" + ) # Metric-specific checks that do not replace nor duplicate `check_array`. distance_metric._validate_data(X) distance_metric._validate_data(Y) + # TODO: dispatch to other dataset pairs for sparse support once available: if issparse(X) or issparse(Y): raise ValueError("Only dense datasets are supported for X and Y.") diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 86dee15dd9cc7..3ae51e0ee00ab 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -181,7 +181,7 @@ cdef class PairwiseDistancesReduction: "pyfunc", # is relatively slow because we need to coerce data as np arrays "mahalanobis", # is numerically unstable # TODO: In order to support discrete distance metrics, we need to have a - # simultaneous sort which breaks ties on indices when distances are identical. + # stable simultaneous sort which preserves the order of the input. # The best might be using std::stable_sort and a Comparator taking an # Arrays of Structures instead of Structure of Arrays (currently used). "hamming", @@ -210,13 +210,9 @@ cdef class PairwiseDistancesReduction: ------- True if the PairwiseDistancesReduction can be used, else False. """ - # Coercing to np.array to get the dtype - # TODO: what is the best way to get lists' dtype? 
- X = np.asarray(X) if not isinstance(X, (np.ndarray, scipy.sparse.spmatrix)) else X - Y = np.asarray(Y) if not isinstance(Y, (np.ndarray, scipy.sparse.spmatrix)) else Y # TODO: support sparse arrays and 32 bits - return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and - not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and + return (not issparse(X) and X.dtype == np.float64 and + not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) def __init__( @@ -289,9 +285,11 @@ cdef class PairwiseDistancesReduction: by parallelizing computation on the outer loop on chunks of X and reduce them. - This strategy dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. + This strategy dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. Private datastructures are modified internally by threads. @@ -363,7 +361,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t thread_num # Allocating datastructures shared by all threads - self._parallel_on_Y_parallel_init() + self._parallel_on_Y_init() for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk @@ -376,7 +374,7 @@ cdef class PairwiseDistancesReduction: thread_num = _openmp_thread_num() # Initializing datastructures used in this thread - self._parallel_on_Y_init(thread_num) + self._parallel_on_Y_parallel_init(thread_num) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -466,13 +464,13 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after executing all the reductions.""" return - cdef void _parallel_on_Y_parallel_init( + cdef void _parallel_on_Y_init( self, ) nogil: """Allocate datastructures used in all threads.""" return - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, ) nogil: @@ -737,7 +735,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.k ) - cdef void _parallel_on_Y_parallel_init( + cdef void _parallel_on_Y_init( self, ) nogil: cdef: @@ -763,7 +761,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) @final - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, ) nogil: diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 844f8e7138d20..104087b8cef9e 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -76,9 +76,6 @@ def test_pairwise_distances_reduction_is_usable_for(): X.astype(np.int64), Y.astype(np.int64), metric ) - assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) - assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) - assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") # TODO: remove once 32 bits datasets are supported assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) @@ -96,16 +93,20 @@ def test_argkmin_factory_method_wrong_usages(): k = 5 metric = "euclidean" - with pytest.raises( - ValueError, 
match="Only 64bit float datasets are supported for X and Y." - ): + msg = ( + "Only 64bit float datasets are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): PairwiseDistancesArgKmin.compute( X=X.astype(np.float32), Y=Y, k=k, metric=metric ) - with pytest.raises( - ValueError, match="Only 64bit float datasets are supported for X and Y." - ): + msg = ( + "Only 64bit float datasets are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) with pytest.raises(ValueError, match="k == -1, must be >= 1."): From 59b153cf5c60ed31715e47976591c603bb2189b7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 10:54:08 +0100 Subject: [PATCH 11/22] Remove unused _sqeuclidean_row_norms Co-authored-by: Thomas J. Fan --- .../metrics/_pairwise_distances_reduction.pyx | 25 ------------------- .../test_pairwise_distances_reduction.py | 22 ---------------- 2 files changed, 47 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 3ae51e0ee00ab..967c62b46546d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -51,31 +51,6 @@ from ..utils._typedefs import ITYPE, DTYPE np.import_array() - -cpdef DTYPE_t[::1] _sqeuclidean_row_norms( - const DTYPE_t[:, ::1] X, - ITYPE_t num_threads, -): - """Compute the squared euclidean norm of the rows of X in parallel. - - This is faster than using np.einsum("ij, ij->i") even when using a single thread. - """ - cdef: - # Casting for X to remove the const qualifier is needed because APIs - # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' - # const qualifier. - # See: https://github.com/scipy/scipy/issues/1426 - DTYPE_t * X_ptr = &X[0, 0] - ITYPE_t idx = 0 - ITYPE_t n = X.shape[0] - ITYPE_t d = X.shape[1] - DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) - - for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): - squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) - - return squared_row_norms - cdef class PairwiseDistancesReduction: """Abstract base class for pairwise distance computation & reduction. 
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 104087b8cef9e..18aba2acd79c8 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -6,7 +6,6 @@ from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, PairwiseDistancesArgKmin, - _sqeuclidean_row_norms, ) from sklearn.utils.fixes import sp_version, parse_version @@ -340,24 +339,3 @@ def test_euclidean_translation_invariance( ASSERT_RESULT[PairwiseDistancesReduction]( reference_dist, dist, reference_indices, indices ) - - -@pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("num_threads", [1, 2, 8]) -def test_sqeuclidean_row_norms( - seed, - n_samples, - n_features, - num_threads, - dtype=np.float64, -): - rng = np.random.RandomState(seed) - spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread - - sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 - sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) - - assert_allclose(sq_row_norm_reference, sq_row_norm) From 395f92a6ecec471e7f16bd127d01561fd7fb85f1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 11:06:39 +0100 Subject: [PATCH 12/22] Swap argkmin_indices and argkmin_distances To have argkmin_indices always be the first for consistency. Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 13 ++++++++++--- .../tests/test_pairwise_distances_reduction.py | 16 ++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 967c62b46546d..ce247971eabcd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -575,8 +575,15 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Returns ------- - Indices of argkmin for each vector in X and its associated distances - if return_distance=True. + If return_distance=False: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + If return_distance=True: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + - argkmin_distances : ndarray of shape (n_samples_X, k) + Distances to the argkmin for each vector in X. Notes ----- @@ -812,6 +819,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # We need to recompute distances because we relied on # surrogate distances for the reduction. 
self.compute_exact_distances() - return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + return np.asarray(self.argkmin_indices), np.asarray(self.argkmin_distances) return np.asarray(self.argkmin_indices) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 18aba2acd79c8..33c746e0ff15c 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -158,14 +158,14 @@ def test_chunk_size_agnosticism( else 10 ** np.log(n_features) ) - ref_dist, ref_indices = PairwiseDistancesReduction.compute( + ref_indices, ref_dist = PairwiseDistancesReduction.compute( X, Y, parameter, return_distance=True, ) - dist, indices = PairwiseDistancesReduction.compute( + indices, dist = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -204,14 +204,14 @@ def test_n_threads_agnosticism( else 10 ** np.log(n_features) ) - ref_dist, ref_indices = PairwiseDistancesReduction.compute( + ref_indices, ref_dist = PairwiseDistancesReduction.compute( X, Y, parameter, return_distance=True, ) - dist, indices = PairwiseDistancesReduction.compute( + indices, dist = PairwiseDistancesReduction.compute( X, Y, parameter, n_threads=1, return_distance=True ) @@ -251,7 +251,7 @@ def test_strategies_consistency( else 10 ** np.log(n_features) ) - dist_par_X, indices_par_X = PairwiseDistancesReduction.compute( + indices_par_X, dist_par_X = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -264,7 +264,7 @@ def test_strategies_consistency( return_distance=True, ) - dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute( + indices_par_Y, dist_par_Y = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -318,7 +318,7 @@ def test_euclidean_translation_invariance( X = np.ascontiguousarray(X[:, :2]) Y = np.ascontiguousarray(Y[:, :2]) - reference_dist, reference_indices = PairwiseDistancesReduction.compute( + reference_indices, reference_dist = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -327,7 +327,7 @@ def test_euclidean_translation_invariance( return_distance=True, ) - dist, indices = PairwiseDistancesReduction.compute( + indices, dist = PairwiseDistancesReduction.compute( X + 0, Y + 0, parameter, From 09a95272b4bd5b02692e8fde8ff26959f9814cae Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 11:23:10 +0100 Subject: [PATCH 13/22] Move initializations from __init__ to __cinit__ This is more appropriate, especially for dynamic allocation. See: cython.readthedocs.io/en/latest/src/userguide/special_methods.html#initialisation-methods-cinit-and-init The __cinit__() method is where you should perform basic C-level initialisation of the object, including allocation of any C data structures that your object will own. Co-authored-by: Thomas J. 
Fan --- .../metrics/_pairwise_distances_reduction.pyx | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index ce247971eabcd..fe59d141f8ff7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -190,12 +190,14 @@ cdef class PairwiseDistancesReduction: not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) - def __init__( + def __cinit__( self, DatasetsPair datasets_pair, chunk_size=None, n_threads=None, strategy=None, + *args, + **kwargs, ): cdef: ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks @@ -474,9 +476,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): datasets_pair: DatasetsPair The dataset pairs (X, Y) for the reduction. - k: int - The k for the argkmin reduction. - chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, @@ -485,10 +484,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks - depends on the `strategy` set on :method:`~ArgKmin.compute`. + depends on the `strategy` set on + :meth:`~PairwiseDistancesArgKmin.compute`. See _openmp_effective_n_threads, for details about the specification of n_threads. + + k: int, default=1 + The k for the argkmin reduction. """ cdef: @@ -544,7 +547,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on - :method:`~PairwiseDistancesArgKmin.compute`. + :meth:`~PairwiseDistancesArgKmin.compute`. See _openmp_effective_n_threads, for details about the specification of n_threads. @@ -621,25 +624,22 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): return pda._finalize_results(return_distance) - def __init__( + def __cinit__( self, DatasetsPair datasets_pair, - ITYPE_t k, chunk_size=None, n_threads=None, strategy=None, - ): - super().__init__(datasets_pair, chunk_size, n_threads, strategy) - - self.k = check_scalar(k, "k", Integral, min_val=1) - + *args, + **kwargs, + ): # Allocating pointers to datastructures but not the datastructures themselves. # There are as many pointers as effective threads. # # For the sake of explicitness: - # - when parallelizing on X, those heaps pointers are referencing + # - when parallelizing on X, the pointers of those heaps are referencing # (with proper offsets) addresses of the two main heaps (see bellow) - # - when parallelizing on Y, those heaps pointer heaps are referencing + # - when parallelizing on Y, the pointers of those heaps are referencing # small heaps which are thread-wise-allocated and whose content will be # merged with the main heaps'. self.heaps_r_distances_chunks = malloc( @@ -649,6 +649,16 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.chunks_n_threads ) + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + strategy=None, + ITYPE_t k=1, + ): + self.k = check_scalar(k, "k", Integral, min_val=1) + # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`. 
self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -763,10 +773,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. # This is done in parallel sample-wise (no need for locks). - # + # # This might break each thread's data locality as each heap which # was allocated in a thread is being now being used in several threads. - # + # # Still, this parallel pattern has shown to be efficient in practice. for idx in prange(X_end - X_start, schedule="static"): for thread_num in range(self.chunks_n_threads): @@ -792,7 +802,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): free(self.heaps_indices_chunks[thread_num]) # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). + # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): simultaneous_sort( &self.argkmin_distances[idx, 0], From f396a585ea67b2d99effa0b5b5a168b2b7e61172 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 08:30:44 +0100 Subject: [PATCH 14/22] Improve docstring comment Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index fe59d141f8ff7..35c9095c57534 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -324,9 +324,10 @@ cdef class PairwiseDistancesReduction: and reduce them. This strategy dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. Private datastructures are modified internally by threads. 
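To make the chunking strategies concrete, here is a hypothetical usage sketch of the private `PairwiseDistancesArgKmin.compute` classmethod as it stands at this point in the series. The argument names and the `(indices, distances)` return order follow the tests shown earlier; since this is a private submodule, the interface may still change.

    import numpy as np
    from sklearn.metrics._pairwise_distances_reduction import PairwiseDistancesArgKmin

    rng = np.random.RandomState(0)
    X = rng.rand(1_000, 50)    # float64 and C-contiguous, as required
    Y = rng.rand(10_000, 50)

    # 'parallel_on_Y' is typically preferred when X has few rows but Y is large.
    argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute(
        X,
        Y,
        k=10,
        metric="euclidean",
        strategy="parallel_on_Y",
        return_distance=True,
    )
    # argkmin_indices[i] holds the indices of the 10 rows of Y closest to X[i];
    # argkmin_distances[i] holds the matching distances, sorted in ascending order.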
From 22f4f30e1f8c5d2bdd6af2333bde8da6ee548477 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 09:04:40 +0100 Subject: [PATCH 15/22] Improve comments Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 35c9095c57534..099c39df77604 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -24,22 +24,12 @@ from libc.float cimport DBL_MAX from cython cimport final from cython.parallel cimport parallel, prange -from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair -from ..utils._cython_blas cimport ( - BLAS_Order, - BLAS_Trans, - ColMajor, - NoTrans, - RowMajor, - Trans, - _dot, - _gemm, -) +from ._dist_metrics cimport DatasetsPair from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t -from numbers import Integral, Real +from numbers import Integral from typing import List from scipy.sparse import issparse from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING @@ -472,6 +462,12 @@ cdef class PairwiseDistancesReduction: cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): """Compute the argkmin of row vectors of X on the ones of Y. + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. + + PairwiseDistancesArgKmin is typically used to perform + bruteforce k-nearest neighbors queries. + Parameters ---------- datasets_pair: DatasetsPair @@ -556,19 +552,27 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None The chunking strategy defining which dataset parallelization are made on. - Strategies differs on the dispatching they use for chunks on threads: + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: - - 'parallel_on__X' dispatches chunks of X uniformly on threads. + - 'parallel_on_X' dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is embarrassingly parallel and comes with no datastructures synchronisation. - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. - 'auto' relies on a simple heuristic to choose between - 'parallel_on__X' and 'parallel_on_Y'. + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. 
- None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. From 234fb01151bb60732023b09dd8d3bdc3ed5ce863 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 09:15:37 +0100 Subject: [PATCH 16/22] Add 'pairwise_dist_chunk_size' to scikit-learn config --- sklearn/_config.py | 30 ++++++++++++++++++++++++++++-- sklearn/tests/test_config.py | 3 +++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index c41c180012056..d6a02737f640d 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -9,6 +9,9 @@ "working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)), "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": int( + os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) + ), } _threadlocal = threading.local() @@ -40,7 +43,11 @@ def get_config(): def set_config( - assume_finite=None, working_memory=None, print_changed_only=None, display=None + assume_finite=None, + working_memory=None, + print_changed_only=None, + display=None, + pairwise_dist_chunk_size=None, ): """Set global scikit-learn configuration @@ -80,6 +87,12 @@ def set_config( .. versionadded:: 0.23 + pairwise_dist_chunk_size : int, default=None + The number of vectors per chunk for PairwiseDistancesReduction. + Default is 256 (suitable for most of modern laptops' caches and architectures). + + .. versionadded:: 1.1 + See Also -------- config_context : Context manager for global scikit-learn configuration. @@ -95,11 +108,18 @@ def set_config( local_config["print_changed_only"] = print_changed_only if display is not None: local_config["display"] = display + if pairwise_dist_chunk_size is not None: + local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size @contextmanager def config_context( - *, assume_finite=None, working_memory=None, print_changed_only=None, display=None + *, + assume_finite=None, + working_memory=None, + print_changed_only=None, + display=None, + pairwise_dist_chunk_size=None, ): """Context manager for global scikit-learn configuration. @@ -138,6 +158,12 @@ def config_context( .. versionadded:: 0.23 + pairwise_dist_chunk_size : int, default=None + The number of vectors per chunk for PairwiseDistancesReduction. + Default is 256 (suitable for most of modern laptops' caches and architectures). + + .. versionadded:: 1.1 + Yields ------ None. 
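The new option can be toggled globally with `set_config` or locally with `config_context`. A short sketch, assuming the `SKLEARN_PAIRWISE_DIST_CHUNK_SIZE` environment variable is not set so the default of 256 introduced by this patch applies:

    from sklearn import config_context, get_config

    # Default chunk size when the environment variable is unset.
    assert get_config()["pairwise_dist_chunk_size"] == 256

    with config_context(pairwise_dist_chunk_size=512):
        # Inside the context, chunk-based reductions use 512 vectors per chunk.
        assert get_config()["pairwise_dist_chunk_size"] == 512

    # The previous value is restored on exit.
    assert get_config()["pairwise_dist_chunk_size"] == 256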
diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index f78a9ff30b10a..e99eb5fc9db82 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -16,6 +16,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } # Not using as a context manager affects nothing @@ -28,6 +29,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } assert get_config()["assume_finite"] is False @@ -57,6 +59,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } # No positional arguments From f89c65d4795e7d884eef27a1876d547a2b5b6f5b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 10:03:34 +0100 Subject: [PATCH 17/22] TST Adapt test for PairwiseDistancesArgKmin translation invariance Co-authored-by: Olivier Grisel --- .../test_pairwise_distances_reduction.py | 89 ++++++++++++------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 33c746e0ff15c..2982bc32b06d1 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -2,6 +2,7 @@ import pytest from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import csr_matrix +from scipy.spatial.distance import cdist from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, @@ -10,6 +11,22 @@ from sklearn.utils.fixes import sp_version, parse_version +# Common supported metric between scipy.spatial.distance.cdist +# and PairwiseDistancesReduction. +# This allows constructing tests to check consistency of results +# of concrete PairwiseDistancesReduction on some metrics using APIs +# from scipy and numpy. +CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "cityblock", + "euclidean", + "minkowski", + "seuclidean", + "wminkowski", +] + def _get_dummy_metric_params_list(metric: str, n_features: int): """Return list of dummy DistanceMetric kwargs for tests.""" @@ -285,57 +302,63 @@ def test_strategies_consistency( ) +# Concrete PairwiseDistancesReductions tests + + @pytest.mark.parametrize("n_features", [50, 500]) -@pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) -@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize("translation", [10 ** i for i in [2, 4, 8]]) +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) @pytest.mark.parametrize( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin], ) -def test_euclidean_translation_invariance( +def test_argkmin_translation_invariance( n_features, translation, metric, - PairwiseDistancesReduction, - n_samples=1000, + strategy, + n_samples=100, + k=10, dtype=np.float64, ): - # The reduction must be translation invariant. - parameter = ( - 10 - if PairwiseDistancesReduction is PairwiseDistancesArgKmin - # Scaling the radius slightly with the numbers of dimensions - else 10 ** np.log(n_features) - ) + # PairwiseDistancesArgKmin must be translation invariant. 
rng = np.random.RandomState(0) - spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread - Y = rng.rand(n_samples, n_features).astype(dtype) * spread + spread = 1000 + X_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + Y_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread # Haversine distance only accepts 2D data if metric == "haversine": - X = np.ascontiguousarray(X[:, :2]) - Y = np.ascontiguousarray(Y[:, :2]) - - reference_indices, reference_dist = PairwiseDistancesReduction.compute( - X, - Y, - parameter, - metric=metric, - metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], - return_distance=True, - ) - - indices, dist = PairwiseDistancesReduction.compute( - X + 0, - Y + 0, - parameter, + X_translated = np.ascontiguousarray(X_translated[:, :2]) + Y_translated = np.ascontiguousarray(Y_translated[:, :2]) + + metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] + + # Reference for argkmin results + dist_matrix = cdist(X_translated, Y_translated, metric=metric, **metric_kwargs) + # Taking argkmin (indices of the k smallest values) + argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] + # Getting the associated distances + argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float) + for row_idx in range(argkmin_indices_ref.shape[0]): + argkmin_distances_ref[row_idx] = dist_matrix[ + row_idx, argkmin_indices_ref[row_idx] + ] + + argkmin_indices, argkmin_distances = PairwiseDistancesReduction.compute( + X_translated, + Y_translated, + k, metric=metric, - metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + metric_kwargs=metric_kwargs, return_distance=True, + # So as to have more than a chunk, forcing parallelism. + chunk_size=n_samples // 4, + strategy=strategy, ) ASSERT_RESULT[PairwiseDistancesReduction]( - reference_dist, dist, reference_indices, indices + argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref ) From fe17af1dca9ddbd2cf2a18cf5d24b3c57001ad6c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 5 Jan 2022 10:19:09 +0100 Subject: [PATCH 18/22] test_pairwise_distances_argkmin --- .../tests/test_pairwise_distances_reduction.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 2982bc32b06d1..eec8838e2f20d 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -306,14 +306,10 @@ def test_strategies_consistency( @pytest.mark.parametrize("n_features", [50, 500]) -@pytest.mark.parametrize("translation", [10 ** i for i in [2, 4, 8]]) +@pytest.mark.parametrize("translation", [0, 1e8]) @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) -@pytest.mark.parametrize( - "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin], -) -def test_argkmin_translation_invariance( +def test_pairwise_distances_argkmin( n_features, translation, metric, @@ -322,8 +318,6 @@ def test_argkmin_translation_invariance( k=10, dtype=np.float64, ): - # PairwiseDistancesArgKmin must be translation invariant. 
- rng = np.random.RandomState(0) spread = 1000 X_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread @@ -347,7 +341,7 @@ def test_argkmin_translation_invariance( row_idx, argkmin_indices_ref[row_idx] ] - argkmin_indices, argkmin_distances = PairwiseDistancesReduction.compute( + argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute( X_translated, Y_translated, k, @@ -359,6 +353,6 @@ def test_argkmin_translation_invariance( strategy=strategy, ) - ASSERT_RESULT[PairwiseDistancesReduction]( + ASSERT_RESULT[PairwiseDistancesArgKmin]( argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref ) From 38715d2e5a06adef79fdf1e12de44555f9d4ae7d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 5 Jan 2022 10:24:36 +0100 Subject: [PATCH 19/22] Simpler variable names --- .../tests/test_pairwise_distances_reduction.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index eec8838e2f20d..c59d2f765eadb 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -320,18 +320,18 @@ def test_pairwise_distances_argkmin( ): rng = np.random.RandomState(0) spread = 1000 - X_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread - Y_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread # Haversine distance only accepts 2D data if metric == "haversine": - X_translated = np.ascontiguousarray(X_translated[:, :2]) - Y_translated = np.ascontiguousarray(Y_translated[:, :2]) + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] # Reference for argkmin results - dist_matrix = cdist(X_translated, Y_translated, metric=metric, **metric_kwargs) + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) # Taking argkmin (indices of the k smallest values) argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] # Getting the associated distances @@ -342,8 +342,8 @@ def test_pairwise_distances_argkmin( ] argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute( - X_translated, - Y_translated, + X, + Y, k, metric=metric, metric_kwargs=metric_kwargs, From ce986d5310dddadebcf9121d46cdee7185f6416c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 13:33:30 +0100 Subject: [PATCH 20/22] fixup! TST Adapt test for PairwiseDistancesArgKmin translation invariance --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c59d2f765eadb..06439ac7e180a 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -24,9 +24,12 @@ "euclidean", "minkowski", "seuclidean", - "wminkowski", ] +# TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0. 
+if sp_version < parse_version("1.8.0.dev0"):
+    CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS.append("wminkowski")
+

 def _get_dummy_metric_params_list(metric: str, n_features: int):
     """Return list of dummy DistanceMetric kwargs for tests."""

From 70a28b7b6f19a76acf4634c1b33cb789b5440788 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 5 Jan 2022 14:28:48 +0100
Subject: [PATCH 21/22] fixup! fixup! TST Adapt test for PairwiseDistancesArgKmin translation invariance

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 06439ac7e180a..a4d51e4662740 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -26,10 +26,6 @@
     "seuclidean",
 ]

-# TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0.
-if sp_version < parse_version("1.8.0.dev0"):
-    CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS.append("wminkowski")
-

 def _get_dummy_metric_params_list(metric: str, n_features: int):
     """Return list of dummy DistanceMetric kwargs for tests."""
@@ -338,7 +334,7 @@ def test_pairwise_distances_argkmin(
     # Taking argkmin (indices of the k smallest values)
     argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
     # Getting the associated distances
-    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float)
+    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
     for row_idx in range(argkmin_indices_ref.shape[0]):
         argkmin_distances_ref[row_idx] = dist_matrix[
             row_idx, argkmin_indices_ref[row_idx]
         ]

From 06ca86936e52f5fe74f8d4791063d011d065da30 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 5 Jan 2022 14:42:27 +0100
Subject: [PATCH 22/22] Use correct spelling for 'Callback'

Co-authored-by: Christian Lorentzen
---
 sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 099c39df77604..830df08e1a952 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -389,7 +389,7 @@ cdef class PairwiseDistancesReduction:
         return

     def _finalize_results(self, bint return_distance):
-        """Call-back adapting datastructures before returning results.
+        """Callback adapting datastructures before returning results.

         This must be implemented in subclasses.
         """
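Reviewer note (not part of the diff): the brute-force reference built in `test_pairwise_distances_argkmin` boils down to the standalone NumPy/SciPy sketch below. `np.take_along_axis` gathers the reference distances in a single call and is equivalent to the explicit row loop used in the test; the sample sizes here are arbitrary.

    import numpy as np
    from scipy.spatial.distance import cdist

    rng = np.random.RandomState(0)
    n_samples, n_features, k = 100, 50, 10
    X = rng.rand(n_samples, n_features)
    Y = rng.rand(n_samples, n_features)

    # Brute-force reference: the full pairwise distance matrix.
    dist_matrix = cdist(X, Y, metric="euclidean")

    # Indices of the k smallest distances per row, sorted by increasing distance.
    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
    # Distances associated with those indices, gathered row-wise.
    argkmin_distances_ref = np.take_along_axis(dist_matrix, argkmin_indices_ref, axis=1)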