From a068e51f65d8b8981224295ccb76812f462cda02 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 18 Oct 2022 19:35:46 -0400
Subject: [PATCH 01/25] Improved documentation

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 11 ++++-------
 sklearn/cluster/_hdbscan/hdbscan.py        |  8 ++++++++
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index 64aa9573e103a..4118732d6e623 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -4,16 +4,13 @@
 # License: 3-clause BSD
 
 import numpy as np
-from cython.parallel cimport prange
+from scipy.sparse import issparse
+from ...neighbors import BallTree, KDTree
+
 cimport numpy as cnp
+from cython.parallel cimport prange
 from libc.math cimport isfinite
 
-import gc
-
-from scipy.sparse import issparse
-from scipy.spatial.distance import pdist, squareform
-
-from ...neighbors import BallTree, KDTree
 
 def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0):
     """Compute the weighted adjacency matrix of the mutual reachability
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 79beead943898..1658f61b52aed 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -130,6 +130,14 @@ def _hdbscan_brute(
     # max_dist is only relevant for sparse and is ignored for dense
     max_dist = metric_params.get("max_dist", 0.0)
     sparse = issparse(distance_matrix)
+
+    # TODO: Investigate whether it is worth implementing a PWD backend for the
+    # combined operations of:
+    #   - The pairwise distance calculation
+    #   - The element-wise mutual-reachability calculation
+    # I suspect this would be better handled as one composite Cython routine to
+    # minimize memory-movement, however I (@micky774) am unsure whether it is
+    # narrow enough of a scope for the current PWD backend.
     distance_matrix = distance_matrix.tolil() if sparse else distance_matrix
 
     # Note that `distance_matrix` is manipulated in-place, however we do not

From 06152a302916cb1459f9a1589aa6cfad74ccdff4 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 18 Oct 2022 20:20:25 -0400
Subject: [PATCH 02/25] Updated comment

---
 sklearn/cluster/_hdbscan/hdbscan.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 1658f61b52aed..4a8760503ae40 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -137,7 +137,8 @@ def _hdbscan_brute(
     #   - The element-wise mutual-reachability calculation
     # I suspect this would be better handled as one composite Cython routine to
     # minimize memory-movement, however I (@micky774) am unsure whether it is
-    # narrow enough of a scope for the current PWD backend.
+    # narrow enough of a scope for the current PWD backend, or if it is better
+    # as a separate utility.
     distance_matrix = distance_matrix.tolil() if sparse else distance_matrix
 
     # Note that `distance_matrix` is manipulated in-place, however we do not

From 0ab847cdb8a792e812ee95a5adcc034d6523c031 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Oct 2022 15:40:55 +0200
Subject: [PATCH 03/25] MAINT further style improvement

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 179 +++++++++++++--------
 sklearn/cluster/_hdbscan/hdbscan.py        |  22 +--
 2 files changed, 127 insertions(+), 74 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index 64aa9573e103a..0cf3f99fdc2a7 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -1,44 +1,54 @@
-# mutual reachability distance compiutations
+# mutual reachability distance computations
 # Authors: Leland McInnes <leland.mcinnes@gmail.com>
 #          Meekail Zain <zainmeekail@gmail.com>
 # License: 3-clause BSD
 
 import numpy as np
-from cython.parallel cimport prange
+from scipy.sparse import issparse
+
+from ...neighbors import BallTree, KDTree
+
 cimport numpy as cnp
-from libc.math cimport isfinite
+from cython.parallel cimport prange
+from libc.math cimport isfinite, INFINITY
 
-import gc
 
-from scipy.sparse import issparse
-from scipy.spatial.distance import pdist, squareform
+def mutual_reachability_graph(
+    distance_matrix, n_neighbors=5, max_distance=0.0, copy=False
+):
+    """Compute the weighted adjacency matrix of the mutual reachability graph.
 
-from ...neighbors import BallTree, KDTree
+    The mutual reachability distance used to build the graph is defined as::
 
-def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0):
-    """Compute the weighted adjacency matrix of the mutual reachability
-    graph of a distance matrix. Note that computation is performed in-place for
-    `distance_matrix`. If out-of-place computation is required, pass a copy to
-    this function.
+        max(d_core(x_p), d_core(x_q), d(x_p, x_q))
+
+    and the core distance `d_core` is defined as the distance between a point
+    `x_p` and its k-th nearest neighbor.
 
     Parameters
     ----------
-    distance_matrix : ndarray or sparse matrix of shape (n_samples, n_samples)
+    distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
         Array of distances between samples. If sparse, the array must be in
         `LIL` format.
 
-    min_points : int, default=5
+    n_neighbors : int, default=5
         The number of points in a neighbourhood for a point to be considered
         a core point.
 
-    max_dist : float, default=0.0
+    max_distance : float, default=0.0
         The distance which `np.inf` is replaced with. When the true mutual-
         reachability distance is measured to be infinite, it is instead
-        truncated to `max_dist`.
+        truncated to `max_dist`. Only used when `distance_matrix` is a sparse
+        matrix.
+
+    copy : bool, default=False
+        Whether or not to compute the mutual reachinbility graph in-place, i.e.
+        modifying directly `distance_matrix`.
 
     Returns
     -------
-    mututal_reachability: ndarray of shape (n_samples, n_samples)
+    mututal_reachability_graph: {ndarray, sparse matrix} of shape \
+            (n_samples, n_samples)
         Weighted adjacency matrix of the mutual reachability graph.
 
     References
@@ -48,78 +58,121 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0):
        In Pacific-Asia Conference on Knowledge Discovery and Data Mining
        (pp. 160-172). Springer Berlin Heidelberg.
     """
-    # Account for index offset
-    min_points -= 1
+    if copy:
+        distance_matrix = distance_matrix.copy()
 
-    # Note that in both routines `distance_matrix` is operated on in-place. At
-    # this point, if out-of-place operation is desired then this function
-    # should have been passed a copy.
     if issparse(distance_matrix):
-        return _sparse_mutual_reachability(
-            distance_matrix,
-            min_points=min_points,
-            max_dist=max_dist
+        # FIXME: since we convert to a CSR matrix then we do not make the operation
+        # in-place.
+        return _sparse_mutual_reachability_graph(
+            distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance
         ).tocsr()
 
-    return _dense_mutual_reachability(distance_matrix, min_points=min_points)
+    return _dense_mutual_reachability_graph(distance_matrix, n_neighbors=n_neighbors)
+
 
-cdef _dense_mutual_reachability(
+cdef _dense_mutual_reachability_graph(
     cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix,
-    cnp.intp_t min_points=5
+    cnp.intp_t n_neighbors=5
 ):
-    cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0]
-    cdef cnp.float64_t mr_dist
-    cdef cnp.float64_t[:] core_distances
+    """Dense implementation of mutual reachability graph.
+
+    The computation is done in-place, i.e. the distance matrix is modified
+    directly.
+
+    Parameters
+    ----------
+    distance_matrix : ndarray of shape (n_samples, n_samples)
+        Array of distances between samples.
+
+    n_neighbors : int, default=5
+        The number of points in a neighbourhood for a point to be considered
+        a core point.
+
+    Returns
+    -------
+    mututal_reachability_graph : ndarray of shape (n_samples, n_samples)
+        Weighted adjacency matrix of the mutual reachability graph. This object
+        is the same as `distance_matrix` since the operation is done in-place.
+    """
+    cdef:
+        cnp.intp_t i, j, n_samples = distance_matrix.shape[0]
+        cnp.intp_t farther_neighbor_idx = n_neighbors - 1
+        cnp.float64_t mutual_reachibility_distance
+        cnp.float64_t[:] core_distances
 
-    # Compute the core distances for all samples `x_p` corresponding
-    # to the distance of the k-th farthest neighbours (including
-    # `x_p`).
     core_distances = np.partition(
-        distance_matrix,
-        min_points,
-        axis=0,
-    )[min_points]
+        distance_matrix, farther_neighbor_idx, axis=0
+    )[farther_neighbor_idx]
 
     with nogil:
         for i in range(n_samples):
             for j in prange(n_samples):
-                mr_dist = max(
+                mutual_reachibility_distance = max(
                     core_distances[i],
                     core_distances[j],
-                    distance_matrix[i, j]
+                    distance_matrix[i, j],
                 )
-                distance_matrix[i, j] = mr_dist
+                distance_matrix[i, j] = mutual_reachibility_distance
     return distance_matrix
 
-# Assumes LIL format.
+
 # TODO: Rewrite for CSR.
-cdef _sparse_mutual_reachability(
+cdef _sparse_mutual_reachability_graph(
     object distance_matrix,
-    cnp.intp_t min_points=5,
-    cnp.float64_t max_dist=0.
+    cnp.intp_t n_neighbors=5,
+    cnp.float64_t max_distance=0.0,
 ):
-    cdef cnp.intp_t i, j, n, n_samples = distance_matrix.shape[0]
-    cdef cnp.float64_t mr_dist
-    cdef cnp.float64_t[:] core_distances
-    cdef cnp.int32_t[:] nz_row_data, nz_col_data
+    """Sparse implementation of mutual reachability graph.
+
+    The computation is done in-place, i.e. the distance matrix is modified
+    directly. This implementation only accepts `LIL` format sparse matrices.
+
+    Parameters
+    ----------
+    distance_matrix : sparse matrix of shape (n_samples, n_samples)
+        Sparse matrix of distances between samples. The sparse format should
+        be `LIL`.
+
+    n_neighbors : int, default=5
+        The number of points in a neighbourhood for a point to be considered
+        a core point.
+
+    Returns
+    -------
+    mututal_reachability_graph : sparse matrix of shape (n_samples, n_samples)
+        Weighted adjacency matrix of the mutual reachability graph. This object
+        is the same as `distance_matrix` since the operation is done in-place.
+    """
+    cdef:
+        cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0]
+        list row_distances
+        cnp.intp_t farther_neighbor_idx = n_neighbors - 1
+        cnp.float64_t mutual_reachibility_distance
+        cnp.float64_t[:] core_distances
+        cnp.int32_t[:] nz_row_data, nz_col_data
+
     core_distances = np.empty(n_samples, dtype=np.float64)
 
     for i in range(n_samples):
-        if min_points < len(distance_matrix.data[i]):
+        row_distances = distance_matrix.data[i]
+        if farther_neighbor_idx < len(row_distances):
             core_distances[i] = np.partition(
-                distance_matrix.data[i],
-                min_points
-            )[min_points]
+                row_distances, farther_neighbor_idx
+            )[farther_neighbor_idx]
         else:
-            core_distances[i] = np.infty
+            core_distances[i] = INFINITY
 
     nz_row_data, nz_col_data = distance_matrix.nonzero()
-    for n in range(nz_row_data.shape[0]):
-        i = nz_row_data[n]
-        j = nz_col_data[n]
-        mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j])
-        if isfinite(mr_dist):
-            distance_matrix[i, j] = mr_dist
-        elif max_dist > 0:
-            distance_matrix[i, j] = max_dist
+    for sample_idx in range(nz_row_data.shape[0]):
+        i, j = nz_row_data[sample_idx], nz_col_data[sample_idx]
+        mutual_reachibility_distance = max(
+            core_distances[i], core_distances[j], distance_matrix[i, j]
+        )
+        if isfinite(mutual_reachibility_distance):
+            distance_matrix[i, j] = mutual_reachibility_distance
+        elif max_distance > 0:
+            # TODO: it seems that we assume that distance_matrix is initialized
+            # with zeros.
+            distance_matrix[i, j] = max_distance
     return distance_matrix
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 79beead943898..fe2a641bb07f5 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -22,7 +22,7 @@
 from ...utils._param_validation import Interval, StrOptions
 from ...utils.validation import _assert_all_finite
 from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix
-from ._reachability import mutual_reachability
+from ._reachability import mutual_reachability_graph
 from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut
 
 FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics
@@ -63,7 +63,7 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False):
             f"There exists points with fewer than {min_samples} neighbors. Ensure"
             " your distance matrix has non-zero values for at least"
             f" `min_sample`={min_samples} neighbors for each points (i.e. K-nn"
-            " graph), or specify a `max_dist` in `metric_params` to use when"
+            " graph), or specify a `max_distance` in `metric_params` to use when"
             " distances are missing."
         )
 
@@ -108,7 +108,7 @@ def _process_mst(min_spanning_tree):
 
 def _hdbscan_brute(
     X,
-    min_samples=5,
+    n_neighbors=5,
     alpha=None,
     metric="euclidean",
     n_jobs=None,
@@ -128,17 +128,17 @@ def _hdbscan_brute(
     distance_matrix /= alpha
 
     # max_dist is only relevant for sparse and is ignored for dense
-    max_dist = metric_params.get("max_dist", 0.0)
+    max_distance = metric_params.get("max_distance", 0.0)
     sparse = issparse(distance_matrix)
     distance_matrix = distance_matrix.tolil() if sparse else distance_matrix
 
     # Note that `distance_matrix` is manipulated in-place, however we do not
     # need it for anything else past this point, hence the operation is safe.
-    mutual_reachability_ = mutual_reachability(
-        distance_matrix, min_points=min_samples, max_dist=max_dist
+    mutual_reachability_ = mutual_reachability_graph(
+        distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance
     )
     min_spanning_tree = _brute_mst(
-        mutual_reachability_, min_samples=min_samples, sparse=sparse
+        mutual_reachability_, min_samples=n_neighbors, sparse=sparse
     )
     # Warn if the MST couldn't be constructed around the missing distances
     if np.isinf(min_spanning_tree.T[2]).any():
@@ -156,7 +156,7 @@ def _hdbscan_brute(
 def _hdbscan_prims(
     X,
     algo,
-    min_samples=5,
+    n_neighbors=5,
     alpha=1.0,
     metric="euclidean",
     leaf_size=40,
@@ -168,7 +168,7 @@ def _hdbscan_prims(
 
     # Get distance to kth nearest neighbour
     nbrs = NearestNeighbors(
-        n_neighbors=min_samples,
+        n_neighbors=n_neighbors,
         algorithm=algo,
         leaf_size=leaf_size,
         metric=metric,
@@ -177,7 +177,7 @@ def _hdbscan_prims(
         p=None,
     ).fit(X)
 
-    neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True)
+    neighbors_distances, _ = nbrs.kneighbors(X, n_neighbors, return_distance=True)
     core_distances = np.ascontiguousarray(neighbors_distances[:, -1])
     dist_metric = DistanceMetric.get_metric(metric, **metric_params)
 
@@ -590,7 +590,7 @@ def fit(self, X, y=None):
         mst_func = None
         kwargs = dict(
             X=X,
-            min_samples=self._min_samples,
+            n_neighbors=self._min_samples,
             alpha=self.alpha,
             metric=self.metric,
             n_jobs=self.n_jobs,

From d6a59a53be9dd0ea60e8e4b2a17d65c8d9398e40 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Oct 2022 15:54:33 +0200
Subject: [PATCH 04/25] FIX let's be consistent and call min_samples

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 20 ++++++++++----------
 sklearn/cluster/_hdbscan/hdbscan.py        | 14 +++++++-------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index 3c5a8b86a9f5a..fb9c288c039b1 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -12,7 +12,7 @@ from libc.math cimport isfinite, INFINITY
 
 
 def mutual_reachability_graph(
-    distance_matrix, n_neighbors=5, max_distance=0.0, copy=False
+    distance_matrix, min_samples=5, max_distance=0.0, copy=False
 ):
     """Compute the weighted adjacency matrix of the mutual reachability graph.
 
@@ -29,7 +29,7 @@ def mutual_reachability_graph(
         Array of distances between samples. If sparse, the array must be in
         `LIL` format.
 
-    n_neighbors : int, default=5
+    min_samples : int, default=5
         The number of points in a neighbourhood for a point to be considered
         a core point.
 
@@ -63,15 +63,15 @@ def mutual_reachability_graph(
         # FIXME: since we convert to a CSR matrix then we do not make the operation
         # in-place.
         return _sparse_mutual_reachability_graph(
-            distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance
+            distance_matrix, min_samples=min_samples, max_distance=max_distance
         ).tocsr()
 
-    return _dense_mutual_reachability_graph(distance_matrix, n_neighbors=n_neighbors)
+    return _dense_mutual_reachability_graph(distance_matrix, min_samples=min_samples)
 
 
 cdef _dense_mutual_reachability_graph(
     cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix,
-    cnp.intp_t n_neighbors=5
+    cnp.intp_t min_samples=5
 ):
     """Dense implementation of mutual reachability graph.
 
@@ -83,7 +83,7 @@ cdef _dense_mutual_reachability_graph(
     distance_matrix : ndarray of shape (n_samples, n_samples)
         Array of distances between samples.
 
-    n_neighbors : int, default=5
+    min_samples : int, default=5
         The number of points in a neighbourhood for a point to be considered
         a core point.
 
@@ -95,7 +95,7 @@ cdef _dense_mutual_reachability_graph(
     """
     cdef:
         cnp.intp_t i, j, n_samples = distance_matrix.shape[0]
-        cnp.intp_t farther_neighbor_idx = n_neighbors - 1
+        cnp.intp_t farther_neighbor_idx = min_samples - 1
         cnp.float64_t mutual_reachibility_distance
         cnp.float64_t[:] core_distances
 
@@ -118,7 +118,7 @@ cdef _dense_mutual_reachability_graph(
 # TODO: Rewrite for CSR.
 cdef _sparse_mutual_reachability_graph(
     object distance_matrix,
-    cnp.intp_t n_neighbors=5,
+    cnp.intp_t min_samples=5,
     cnp.float64_t max_distance=0.0,
 ):
     """Sparse implementation of mutual reachability graph.
@@ -132,7 +132,7 @@ cdef _sparse_mutual_reachability_graph(
         Sparse matrix of distances between samples. The sparse format should
         be `LIL`.
 
-    n_neighbors : int, default=5
+    min_samples : int, default=5
         The number of points in a neighbourhood for a point to be considered
         a core point.
 
@@ -145,7 +145,7 @@ cdef _sparse_mutual_reachability_graph(
     cdef:
         cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0]
         list row_distances
-        cnp.intp_t farther_neighbor_idx = n_neighbors - 1
+        cnp.intp_t farther_neighbor_idx = min_samples - 1
         cnp.float64_t mutual_reachibility_distance
         cnp.float64_t[:] core_distances
         cnp.int32_t[:] nz_row_data, nz_col_data
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index d5da676409423..5ff89f68dcf8d 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -108,7 +108,7 @@ def _process_mst(min_spanning_tree):
 
 def _hdbscan_brute(
     X,
-    n_neighbors=5,
+    min_samples=5,
     alpha=None,
     metric="euclidean",
     n_jobs=None,
@@ -144,10 +144,10 @@ def _hdbscan_brute(
     # Note that `distance_matrix` is manipulated in-place, however we do not
     # need it for anything else past this point, hence the operation is safe.
     mutual_reachability_ = mutual_reachability_graph(
-        distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance
+        distance_matrix, min_samples=min_samples, max_distance=max_distance
     )
     min_spanning_tree = _brute_mst(
-        mutual_reachability_, min_samples=n_neighbors, sparse=sparse
+        mutual_reachability_, min_samples=min_samples, sparse=sparse
     )
     # Warn if the MST couldn't be constructed around the missing distances
     if np.isinf(min_spanning_tree.T[2]).any():
@@ -165,7 +165,7 @@ def _hdbscan_brute(
 def _hdbscan_prims(
     X,
     algo,
-    n_neighbors=5,
+    min_samples=5,
     alpha=1.0,
     metric="euclidean",
     leaf_size=40,
@@ -177,7 +177,7 @@ def _hdbscan_prims(
 
     # Get distance to kth nearest neighbour
     nbrs = NearestNeighbors(
-        n_neighbors=n_neighbors,
+        n_neighbors=min_samples,
         algorithm=algo,
         leaf_size=leaf_size,
         metric=metric,
@@ -186,7 +186,7 @@ def _hdbscan_prims(
         p=None,
     ).fit(X)
 
-    neighbors_distances, _ = nbrs.kneighbors(X, n_neighbors, return_distance=True)
+    neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True)
     core_distances = np.ascontiguousarray(neighbors_distances[:, -1])
     dist_metric = DistanceMetric.get_metric(metric, **metric_params)
 
@@ -599,7 +599,7 @@ def fit(self, X, y=None):
         mst_func = None
         kwargs = dict(
             X=X,
-            n_neighbors=self._min_samples,
+            min_samples=self._min_samples,
             alpha=self.alpha,
             metric=self.metric,
             n_jobs=self.n_jobs,

From e09ece767de7523f1eb0f911dcef1e80cf3d95c9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Oct 2022 18:07:14 +0200
Subject: [PATCH 05/25] TMP POC for CSC processing

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 78 +++++++++++++---------
 1 file changed, 46 insertions(+), 32 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index fb9c288c039b1..be10d28ab8555 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -6,11 +6,17 @@
 import numpy as np
 from scipy.sparse import issparse
 
+cimport cython
 cimport numpy as cnp
 from cython.parallel cimport prange
 from libc.math cimport isfinite, INFINITY
 
 
+ctypedef fused integral:
+    int
+    long long
+
+
 def mutual_reachability_graph(
     distance_matrix, min_samples=5, max_distance=0.0, copy=False
 ):
@@ -59,19 +65,30 @@ def mutual_reachability_graph(
     if copy:
         distance_matrix = distance_matrix.copy()
 
+    further_neighbor_idx = min_samples - 1
     if issparse(distance_matrix):
         # FIXME: since we convert to a CSR matrix then we do not make the operation
         # in-place.
-        return _sparse_mutual_reachability_graph(
-            distance_matrix, min_samples=min_samples, max_distance=max_distance
-        ).tocsr()
+        distance_matrix = distance_matrix.tocsc()
+        _sparse_mutual_reachability_graph(
+            distance_matrix.data,
+            distance_matrix.indices,
+            distance_matrix.indptr,
+            distance_matrix.shape,
+            further_neighbor_idx=further_neighbor_idx,
+            max_distance=max_distance,
+        )
+    else:
+        _dense_mutual_reachability_graph(
+            distance_matrix, further_neighbor_idx=further_neighbor_idx
+        )
+    return distance_matrix
 
-    return _dense_mutual_reachability_graph(distance_matrix, min_samples=min_samples)
 
 
 cdef _dense_mutual_reachability_graph(
     cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix,
-    cnp.intp_t min_samples=5
+    cnp.intp_t further_neighbor_idx=5
 ):
     """Dense implementation of mutual reachability graph.
 
@@ -95,13 +112,12 @@ cdef _dense_mutual_reachability_graph(
     """
     cdef:
         cnp.intp_t i, j, n_samples = distance_matrix.shape[0]
-        cnp.intp_t farther_neighbor_idx = min_samples - 1
         cnp.float64_t mutual_reachibility_distance
         cnp.float64_t[:] core_distances
 
     core_distances = np.partition(
-        distance_matrix, farther_neighbor_idx, axis=0
-    )[farther_neighbor_idx]
+        distance_matrix, further_neighbor_idx, axis=0
+    )[further_neighbor_idx]
 
     with nogil:
         for i in range(n_samples):
@@ -112,13 +128,15 @@ cdef _dense_mutual_reachability_graph(
                     distance_matrix[i, j],
                 )
                 distance_matrix[i, j] = mutual_reachibility_distance
-    return distance_matrix
 
 
 # TODO: Rewrite for CSR.
 cdef _sparse_mutual_reachability_graph(
-    object distance_matrix,
-    cnp.intp_t min_samples=5,
+    cnp.ndarray[cnp.float64_t, ndim=1, mode="c"] data,
+    cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indices,
+    cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indptr,
+    cnp.intp_t n_samples,
+    cnp.intp_t further_neighbor_idx=5,
     cnp.float64_t max_distance=0.0,
 ):
     """Sparse implementation of mutual reachability graph.
@@ -143,34 +161,30 @@ cdef _sparse_mutual_reachability_graph(
         is the same as `distance_matrix` since the operation is done in-place.
     """
     cdef:
-        cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0]
-        list row_distances
-        cnp.intp_t farther_neighbor_idx = min_samples - 1
+        cnp.intp_t i, col_ind, row_ind
         cnp.float64_t mutual_reachibility_distance
         cnp.float64_t[:] core_distances
-        cnp.int32_t[:] nz_row_data, nz_col_data
+        cnp.float64_t[:] col_data
+        cnp.int32_t[:] row_indices
 
     core_distances = np.empty(n_samples, dtype=np.float64)
 
     for i in range(n_samples):
-        row_distances = distance_matrix.data[i]
-        if farther_neighbor_idx < len(row_distances):
+        col_data = data[indptr[i]:indptr[i + 1]]
+        if further_neighbor_idx < col_data.size:
             core_distances[i] = np.partition(
-                row_distances, farther_neighbor_idx
-            )[farther_neighbor_idx]
+                col_data, further_neighbor_idx
+            )[further_neighbor_idx]
         else:
             core_distances[i] = INFINITY
 
-    nz_row_data, nz_col_data = distance_matrix.nonzero()
-    for sample_idx in range(nz_row_data.shape[0]):
-        i, j = nz_row_data[sample_idx], nz_col_data[sample_idx]
-        mutual_reachibility_distance = max(
-            core_distances[i], core_distances[j], distance_matrix[i, j]
-        )
-        if isfinite(mutual_reachibility_distance):
-            distance_matrix[i, j] = mutual_reachibility_distance
-        elif max_distance > 0:
-            # TODO: it seems that we assume that distance_matrix is initialized
-            # with zeros.
-            distance_matrix[i, j] = max_distance
-    return distance_matrix
+    for col_ind in range(n_samples):
+        for i in range(indptr[col_ind], indptr[col_ind + 1]):
+            row_ind = indices[i]
+            mutual_reachibility_distance = max(
+                core_distances[col_ind], core_distances[row_ind], data[i]
+            )
+            if isfinite(mutual_reachibility_distance):
+                data[i] = mutual_reachibility_distance
+            elif max_distance > 0:
+                data[i] = max_distance

From 1cb0db82f91e5bfee8cf445786aacf381e8911d8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 11:34:38 +0200
Subject: [PATCH 06/25] ENH CSR, fused type, no-copy

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 101 ++++++++++-----------
 sklearn/cluster/_hdbscan/hdbscan.py        |  35 ++-----
 2 files changed, 59 insertions(+), 77 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index be10d28ab8555..d347882d5bc82 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -7,10 +7,12 @@ import numpy as np
 from scipy.sparse import issparse
 
 cimport cython
+from cython cimport floating
 cimport numpy as cnp
 from cython.parallel cimport prange
 from libc.math cimport isfinite, INFINITY
 
+cnp.import_array()
 
 ctypedef fused integral:
     int
@@ -18,7 +20,7 @@ ctypedef fused integral:
 
 
 def mutual_reachability_graph(
-    distance_matrix, min_samples=5, max_distance=0.0, copy=False
+    distance_matrix, min_samples=5, max_distance=0.0
 ):
     """Compute the weighted adjacency matrix of the mutual reachability graph.
 
@@ -29,11 +31,13 @@ def mutual_reachability_graph(
     and the core distance `d_core` is defined as the distance between a point
     `x_p` and its k-th nearest neighbor.
 
+    Note that all computations are done in-place.
+
     Parameters
     ----------
     distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
         Array of distances between samples. If sparse, the array must be in
-        `LIL` format.
+        `CSR` format.
 
     min_samples : int, default=5
         The number of points in a neighbourhood for a point to be considered
@@ -45,10 +49,6 @@ def mutual_reachability_graph(
         truncated to `max_dist`. Only used when `distance_matrix` is a sparse
         matrix.
 
-    copy : bool, default=False
-        Whether or not to compute the mutual reachinbility graph in-place, i.e.
-        modifying directly `distance_matrix`.
-
     Returns
     -------
     mututal_reachability_graph: {ndarray, sparse matrix} of shape \
@@ -62,19 +62,17 @@ def mutual_reachability_graph(
        In Pacific-Asia Conference on Knowledge Discovery and Data Mining
        (pp. 160-172). Springer Berlin Heidelberg.
     """
-    if copy:
-        distance_matrix = distance_matrix.copy()
-
     further_neighbor_idx = min_samples - 1
     if issparse(distance_matrix):
-        # FIXME: since we convert to a CSR matrix then we do not make the operation
-        # in-place.
-        distance_matrix = distance_matrix.tocsc()
+        if distance_matrix.format != "csr":
+            raise ValueError(
+                "Only sparse CSR matrices are supported for `distance_matrix`."
+            )
         _sparse_mutual_reachability_graph(
             distance_matrix.data,
             distance_matrix.indices,
             distance_matrix.indptr,
-            distance_matrix.shape,
+            distance_matrix.shape[0],
             further_neighbor_idx=further_neighbor_idx,
             max_distance=max_distance,
         )
@@ -86,9 +84,9 @@ def mutual_reachability_graph(
 
 
 
-cdef _dense_mutual_reachability_graph(
-    cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix,
-    cnp.intp_t further_neighbor_idx=5
+def _dense_mutual_reachability_graph(
+    cnp.ndarray[dtype=floating, ndim=2] distance_matrix,
+    cnp.intp_t further_neighbor_idx,
 ):
     """Dense implementation of mutual reachability graph.
 
@@ -100,24 +98,20 @@ cdef _dense_mutual_reachability_graph(
     distance_matrix : ndarray of shape (n_samples, n_samples)
         Array of distances between samples.
 
-    min_samples : int, default=5
-        The number of points in a neighbourhood for a point to be considered
-        a core point.
-
-    Returns
-    -------
-    mututal_reachability_graph : ndarray of shape (n_samples, n_samples)
-        Weighted adjacency matrix of the mutual reachability graph. This object
-        is the same as `distance_matrix` since the operation is done in-place.
+    further_neighbor_idx : int
+        The index of the furthest neighbor to use to define the core distances.
     """
     cdef:
         cnp.intp_t i, j, n_samples = distance_matrix.shape[0]
-        cnp.float64_t mutual_reachibility_distance
-        cnp.float64_t[:] core_distances
+        floating mutual_reachibility_distance
+        floating[:] core_distances
 
+    # We assume that the distance matrix is symmetric. We choose to sort every
+    # row to have the same implementation than the sparse case that requires
+    # CSR matrix.
     core_distances = np.partition(
-        distance_matrix, further_neighbor_idx, axis=0
-    )[further_neighbor_idx]
+        distance_matrix, further_neighbor_idx, axis=1
+    )[:, further_neighbor_idx]
 
     with nogil:
         for i in range(n_samples):
@@ -130,44 +124,47 @@ cdef _dense_mutual_reachability_graph(
                 distance_matrix[i, j] = mutual_reachibility_distance
 
 
-# TODO: Rewrite for CSR.
-cdef _sparse_mutual_reachability_graph(
-    cnp.ndarray[cnp.float64_t, ndim=1, mode="c"] data,
-    cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indices,
-    cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indptr,
+def _sparse_mutual_reachability_graph(
+    cnp.ndarray[floating, ndim=1, mode="c"] data,
+    cnp.ndarray[integral, ndim=1, mode="c"] indices,
+    cnp.ndarray[integral, ndim=1, mode="c"] indptr,
     cnp.intp_t n_samples,
-    cnp.intp_t further_neighbor_idx=5,
-    cnp.float64_t max_distance=0.0,
+    cnp.intp_t further_neighbor_idx,
+    cnp.float64_t max_distance,
 ):
     """Sparse implementation of mutual reachability graph.
 
     The computation is done in-place, i.e. the distance matrix is modified
-    directly. This implementation only accepts `LIL` format sparse matrices.
+    directly. This implementation only accepts `CSR` format sparse matrices.
 
     Parameters
     ----------
     distance_matrix : sparse matrix of shape (n_samples, n_samples)
         Sparse matrix of distances between samples. The sparse format should
-        be `LIL`.
+        be `CSR`.
 
-    min_samples : int, default=5
-        The number of points in a neighbourhood for a point to be considered
-        a core point.
+    further_neighbor_idx : int
+        The index of the furthest neighbor to use to define the core distances.
 
-    Returns
-    -------
-    mututal_reachability_graph : sparse matrix of shape (n_samples, n_samples)
-        Weighted adjacency matrix of the mutual reachability graph. This object
-        is the same as `distance_matrix` since the operation is done in-place.
+    max_distance : float
+        The distance which `np.inf` is replaced with. When the true mutual-
+        reachability distance is measured to be infinite, it is instead
+        truncated to `max_distance`. Only used when `distance_matrix` is a sparse
+        matrix.
     """
     cdef:
-        cnp.intp_t i, col_ind, row_ind
-        cnp.float64_t mutual_reachibility_distance
-        cnp.float64_t[:] core_distances
-        cnp.float64_t[:] col_data
-        cnp.int32_t[:] row_indices
+        cnp.intp_t i, col_ind
+        integral row_ind
+        floating mutual_reachibility_distance
+        floating[:] core_distances
+        floating[:] col_data
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
 
-    core_distances = np.empty(n_samples, dtype=np.float64)
+    core_distances = np.empty(n_samples, dtype=dtype)
 
     for i in range(n_samples):
         col_data = data[indptr[i]:indptr[i + 1]]
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 5ff89f68dcf8d..753c4145ccc70 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -46,8 +46,8 @@
 }
 
 
-def _brute_mst(mutual_reachability, min_samples, sparse=False):
-    if not sparse:
+def _brute_mst(mutual_reachability, min_samples):
+    if not issparse(mutual_reachability):
         return mst_from_distance_matrix(mutual_reachability)
 
     # Check connected component on mutual reachability
@@ -116,10 +116,6 @@ def _hdbscan_brute(
     **metric_params,
 ):
     if metric == "precomputed":
-        # Treating this case explicitly, instead of letting
-        # sklearn.metrics.pairwise_distances handle it,
-        # enables the usage of numpy.inf in the distance
-        # matrix to indicate missing distance information.
         distance_matrix = X.copy() if copy else X
     else:
         distance_matrix = pairwise_distances(
@@ -127,28 +123,18 @@ def _hdbscan_brute(
         )
     distance_matrix /= alpha
 
-    # max_dist is only relevant for sparse and is ignored for dense
     max_distance = metric_params.get("max_distance", 0.0)
-    sparse = issparse(distance_matrix)
-
-    # TODO: Investigate whether it is worth implementing a PWD backend for the
-    # combined operations of:
-    #   - The pairwise distance calculation
-    #   - The element-wise mutual-reachability calculation
-    # I suspect this would be better handled as one composite Cython routine to
-    # minimize memory-movement, however I (@micky774) am unsure whether it is
-    # narrow enough of a scope for the current PWD backend, or if it is better
-    # as a separate utility.
-    distance_matrix = distance_matrix.tolil() if sparse else distance_matrix
+    if issparse(distance_matrix) and distance_matrix.format != "csr":
+        # we need CSR format to avoid a conversion in `_brute_mst` when calling
+        # `csgraph.connected_components`
+        distance_matrix = distance_matrix.tocsr()
 
     # Note that `distance_matrix` is manipulated in-place, however we do not
     # need it for anything else past this point, hence the operation is safe.
     mutual_reachability_ = mutual_reachability_graph(
         distance_matrix, min_samples=min_samples, max_distance=max_distance
     )
-    min_spanning_tree = _brute_mst(
-        mutual_reachability_, min_samples=min_samples, sparse=sparse
-    )
+    min_spanning_tree = _brute_mst(mutual_reachability_, min_samples=min_samples)
     # Warn if the MST couldn't be constructed around the missing distances
     if np.isinf(min_spanning_tree.T[2]).any():
         warn(
@@ -358,10 +344,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     copy : bool, default=False
         If `copy=True` then any time an in-place modifications would be made
         that would overwrite data passed to :term:`fit`, a copy will first be
-        made, guaranteeing that the original data will be unchanged. Currently
-        this only makes a difference when passing in a dense precomputed
-        distance array (i.e. when `metric="precomputed"`) and using the
-        `"brute"` algorithm (see `algorithm` for details).
+        made, guaranteeing that the original data will be unchanged.
+        Currently, it only applies with `metric="precomputed"`, passing a dense
+        array or a sparse matrix of format CSR and algorithm used is `"brute"`.
 
     Attributes
     ----------

From 8a38591a81308263d1943a807cff79621fc89167 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 11:36:00 +0200
Subject: [PATCH 07/25] iter

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index d347882d5bc82..12fe7d5309152 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -1,6 +1,7 @@
 # mutual reachability distance computations
 # Authors: Leland McInnes <leland.mcinnes@gmail.com>
 #          Meekail Zain <zainmeekail@gmail.com>
+#          Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: 3-clause BSD
 
 import numpy as np

From 41cb21ed37a4debbf5207d0b4e207582a5e17589 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 11:38:59 +0200
Subject: [PATCH 08/25] homogeneous dtype for max_distance

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index 12fe7d5309152..e451f124d270f 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -84,7 +84,6 @@ def mutual_reachability_graph(
     return distance_matrix
 
 
-
 def _dense_mutual_reachability_graph(
     cnp.ndarray[dtype=floating, ndim=2] distance_matrix,
     cnp.intp_t further_neighbor_idx,
@@ -131,7 +130,7 @@ def _sparse_mutual_reachability_graph(
     cnp.ndarray[integral, ndim=1, mode="c"] indptr,
     cnp.intp_t n_samples,
     cnp.intp_t further_neighbor_idx,
-    cnp.float64_t max_distance,
+    floating max_distance,
 ):
     """Sparse implementation of mutual reachability graph.
 

From c510bf85ece258568f41ce38c45d414cd710e388 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 16:33:05 +0200
Subject: [PATCH 09/25] TST add a couple of tests (wip)

---
 sklearn/cluster/_hdbscan/tests/__init__.py    |  0
 .../_hdbscan/tests/test_reachibility.py       | 50 +++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 sklearn/cluster/_hdbscan/tests/__init__.py
 create mode 100644 sklearn/cluster/_hdbscan/tests/test_reachibility.py

diff --git a/sklearn/cluster/_hdbscan/tests/__init__.py b/sklearn/cluster/_hdbscan/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py
new file mode 100644
index 0000000000000..9d610c9a10f2c
--- /dev/null
+++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py
@@ -0,0 +1,50 @@
+import numpy as np
+import pytest
+
+from sklearn.utils._testing import (
+    _convert_container,
+    assert_allclose,
+)
+
+from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
+
+
+def test_mutual_reachability_graph_error_sparse_format():
+    """Check that we raise an error if the sparse format is not CSR."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 10)
+    X = X.T @ X
+    np.fill_diagonal(X, 0.0)
+    X = _convert_container(X, "sparse_csc")
+
+    err_msg = "Only sparse CSR matrices are supported"
+    with pytest.raises(ValueError, match=err_msg):
+        mutual_reachability_graph(X)
+
+
+@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
+def test_mutual_reachability_graph_inplace(array_type):
+    """Check that the operation is happening inplace."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 10)
+    X = X.T @ X
+    np.fill_diagonal(X, 0.0)
+    X = _convert_container(X, array_type)
+
+    mr_graph = mutual_reachability_graph(X)
+
+    assert id(mr_graph) == id(X)
+
+
+def test_mutual_reachability_graph_equivalence_dense_sparse():
+    """Check that we get the same results for dense and sparse implementation."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(5, 5)
+    X_dense = X.T @ X
+    np.fill_diagonal(X_dense, 0.0)
+    X_sparse = _convert_container(X_dense, "sparse_csr")
+
+    mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
+    mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)
+
+    assert_allclose(mr_graph_dense, mr_graph_sparse.A)

From 85c1914afa9b6f9e6a48678fe4f07db51d82b20a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 16:50:13 +0200
Subject: [PATCH 10/25] TST some more tests

---
 sklearn/cluster/_hdbscan/_reachability.pyx    | 29 ++++++++++---------
 .../_hdbscan/tests/test_reachibility.py       | 16 +++++++++-
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index e451f124d270f..d1716dd79e7fd 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -157,7 +157,7 @@ def _sparse_mutual_reachability_graph(
         integral row_ind
         floating mutual_reachibility_distance
         floating[:] core_distances
-        floating[:] col_data
+        floating[:] row_data
 
     if floating is float:
         dtype = np.float32
@@ -167,21 +167,22 @@ def _sparse_mutual_reachability_graph(
     core_distances = np.empty(n_samples, dtype=dtype)
 
     for i in range(n_samples):
-        col_data = data[indptr[i]:indptr[i + 1]]
-        if further_neighbor_idx < col_data.size:
+        row_data = data[indptr[i]:indptr[i + 1]]
+        if further_neighbor_idx < row_data.size:
             core_distances[i] = np.partition(
-                col_data, further_neighbor_idx
+                row_data, further_neighbor_idx
             )[further_neighbor_idx]
         else:
             core_distances[i] = INFINITY
 
-    for col_ind in range(n_samples):
-        for i in range(indptr[col_ind], indptr[col_ind + 1]):
-            row_ind = indices[i]
-            mutual_reachibility_distance = max(
-                core_distances[col_ind], core_distances[row_ind], data[i]
-            )
-            if isfinite(mutual_reachibility_distance):
-                data[i] = mutual_reachibility_distance
-            elif max_distance > 0:
-                data[i] = max_distance
+    with nogil:
+        for col_ind in range(n_samples):
+            for i in range(indptr[col_ind], indptr[col_ind + 1]):
+                row_ind = indices[i]
+                mutual_reachibility_distance = max(
+                    core_distances[col_ind], core_distances[row_ind], data[i]
+                )
+                if isfinite(mutual_reachibility_distance):
+                    data[i] = mutual_reachibility_distance
+                elif max_distance > 0:
+                    data[i] = max_distance
diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py
index 9d610c9a10f2c..c8ba28d0af25b 100644
--- a/sklearn/cluster/_hdbscan/tests/test_reachibility.py
+++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py
@@ -41,10 +41,24 @@ def test_mutual_reachability_graph_equivalence_dense_sparse():
     rng = np.random.RandomState(0)
     X = rng.randn(5, 5)
     X_dense = X.T @ X
-    np.fill_diagonal(X_dense, 0.0)
     X_sparse = _convert_container(X_dense, "sparse_csr")
 
     mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
     mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)
 
     assert_allclose(mr_graph_dense, mr_graph_sparse.A)
+
+
+@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
+    """Check that the computation preserves dtype thanks to fused types."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 10)
+    X = (X.T @ X).astype(dtype)
+    np.fill_diagonal(X, 0.0)
+    X = _convert_container(X, array_type)
+
+    assert X.dtype == dtype
+    mr_graph = mutual_reachability_graph(X)
+    assert mr_graph.dtype == dtype

From 9ba964d14803852f095909d428adf94ada2374b2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 16:53:01 +0200
Subject: [PATCH 11/25] fused type

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index d1716dd79e7fd..d925d1d22e62b 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -153,7 +153,7 @@ def _sparse_mutual_reachability_graph(
         matrix.
     """
     cdef:
-        cnp.intp_t i, col_ind
+        integral i, col_ind
         integral row_ind
         floating mutual_reachibility_distance
         floating[:] core_distances

From 0c65f8cc42217612654daf22805f28331e38fa95 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 20 Oct 2022 16:58:39 +0200
Subject: [PATCH 12/25] FIX put correct name on indices

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index d925d1d22e62b..c83fef742e82e 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -153,8 +153,7 @@ def _sparse_mutual_reachability_graph(
         matrix.
     """
     cdef:
-        integral i, col_ind
-        integral row_ind
+        integral i, col_ind, row_ind
         floating mutual_reachibility_distance
         floating[:] core_distances
         floating[:] row_data
@@ -176,11 +175,11 @@ def _sparse_mutual_reachability_graph(
             core_distances[i] = INFINITY
 
     with nogil:
-        for col_ind in range(n_samples):
-            for i in range(indptr[col_ind], indptr[col_ind + 1]):
-                row_ind = indices[i]
+        for row_ind in range(n_samples):
+            for i in range(indptr[row_ind], indptr[row_ind + 1]):
+                col_ind = indices[i]
                 mutual_reachibility_distance = max(
-                    core_distances[col_ind], core_distances[row_ind], data[i]
+                    core_distances[row_ind], core_distances[col_ind], data[i]
                 )
                 if isfinite(mutual_reachibility_distance):
                     data[i] = mutual_reachibility_distance

From b83f614cbce03728a0bbd164c985fdcdf8f6ce64 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Sat, 5 Nov 2022 14:08:21 -0400
Subject: [PATCH 13/25] Added validation for precomputed distance matrix

---
 sklearn/cluster/_hdbscan/hdbscan.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 753c4145ccc70..dda9ae8507236 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -116,6 +116,19 @@ def _hdbscan_brute(
     **metric_params,
 ):
     if metric == "precomputed":
+        if X.shape[0] != X.shape[1]:
+            raise ValueError(
+                "The precomputed distance matrix is expected to be symmetric, however"
+                f" it has shape {X.shape}. Please verify that the"
+                " distance matrix was constructed correctly."
+            )
+        if np.allclose(X, X.T):
+            raise ValueError(
+                "The precomputed distance matrix is expected to be symmetric, however"
+                " its values appear to be asymmetric. Please verify that the distance"
+                " matrix was constructed correctly."
+            )
+
         distance_matrix = X.copy() if copy else X
     else:
         distance_matrix = pairwise_distances(

From 2310e1e5bbad472def197f1da926da5136f8a0f1 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Sat, 5 Nov 2022 14:34:40 -0400
Subject: [PATCH 14/25] Fixed typo

---
 sklearn/cluster/_hdbscan/hdbscan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index dda9ae8507236..bf58b10e3ccfd 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -122,7 +122,7 @@ def _hdbscan_brute(
                 f" it has shape {X.shape}. Please verify that the"
                 " distance matrix was constructed correctly."
             )
-        if np.allclose(X, X.T):
+        if not np.allclose(X, X.T):
             raise ValueError(
                 "The precomputed distance matrix is expected to be symmetric, however"
                 " its values appear to be asymmetric. Please verify that the distance"

From b99ba60368b0021376082be8368758ad13300404 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 9 Nov 2022 18:36:04 -0500
Subject: [PATCH 15/25] Updated symmetry check to account for sparse

---
 sklearn/cluster/_hdbscan/hdbscan.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index bf58b10e3ccfd..937eac39db926 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -20,7 +20,7 @@
 from ...metrics._dist_metrics import DistanceMetric
 from ...neighbors import BallTree, KDTree, NearestNeighbors
 from ...utils._param_validation import Interval, StrOptions
-from ...utils.validation import _assert_all_finite
+from ...utils.validation import _assert_all_finite, _allclose_dense_sparse
 from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix
 from ._reachability import mutual_reachability_graph
 from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut
@@ -122,7 +122,7 @@ def _hdbscan_brute(
                 f" it has shape {X.shape}. Please verify that the"
                 " distance matrix was constructed correctly."
             )
-        if not np.allclose(X, X.T):
+        if not _allclose_dense_sparse(X, X.T):
             raise ValueError(
                 "The precomputed distance matrix is expected to be symmetric, however"
                 " its values appear to be asymmetric. Please verify that the distance"

From 2874340f467c1b8cccb65669f5252cb714161896 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 29 Nov 2022 18:03:59 -0500
Subject: [PATCH 16/25] Updated import order and used cython integral fused
 type

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index c83fef742e82e..0131c9ed1aca5 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -4,22 +4,17 @@
 #          Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: 3-clause BSD
 
-import numpy as np
-from scipy.sparse import issparse
-
 cimport cython
-from cython cimport floating
 cimport numpy as cnp
+
+import numpy as np
+from scipy.sparse import issparse
+from cython cimport floating, integral
 from cython.parallel cimport prange
 from libc.math cimport isfinite, INFINITY
 
 cnp.import_array()
 
-ctypedef fused integral:
-    int
-    long long
-
-
 def mutual_reachability_graph(
     distance_matrix, min_samples=5, max_distance=0.0
 ):

From c47a8ba61cd751c012fdaf7db07615abd87b3890 Mon Sep 17 00:00:00 2001
From: Meekail Zain <34613774+Micky774@users.noreply.github.com>
Date: Wed, 30 Nov 2022 18:25:04 -0500
Subject: [PATCH 17/25] Update sklearn/cluster/_hdbscan/hdbscan.py

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 sklearn/cluster/_hdbscan/hdbscan.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 937eac39db926..74af44c9e88f5 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -358,8 +358,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         If `copy=True` then any time an in-place modifications would be made
         that would overwrite data passed to :term:`fit`, a copy will first be
         made, guaranteeing that the original data will be unchanged.
-        Currently, it only applies with `metric="precomputed"`, passing a dense
-        array or a sparse matrix of format CSR and algorithm used is `"brute"`.
+        Currently, it only applies when `metric="precomputed"`, when passing
+        a dense array or a CSR sparse matrix and when `algorithm="brute"`.
 
     Attributes
     ----------

From 557975af8375922516f09c0a4dd01f30c7310598 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 30 Nov 2022 18:25:17 -0500
Subject: [PATCH 18/25] Optimized loops and formatted imports

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index 0131c9ed1aca5..efc641df29e19 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -99,18 +99,20 @@ def _dense_mutual_reachability_graph(
     cdef:
         cnp.intp_t i, j, n_samples = distance_matrix.shape[0]
         floating mutual_reachibility_distance
-        floating[:] core_distances
+        floating[::1] core_distances
 
     # We assume that the distance matrix is symmetric. We choose to sort every
     # row to have the same implementation than the sparse case that requires
     # CSR matrix.
-    core_distances = np.partition(
-        distance_matrix, further_neighbor_idx, axis=1
-    )[:, further_neighbor_idx]
+    core_distances = np.ascontiguousarray(
+        np.partition(
+            distance_matrix, further_neighbor_idx, axis=1
+        )[:, further_neighbor_idx]
+    )
 
     with nogil:
-        for i in range(n_samples):
-            for j in prange(n_samples):
+        for i in prange(n_samples):
+            for j in range(n_samples):
                 mutual_reachibility_distance = max(
                     core_distances[i],
                     core_distances[j],
@@ -118,7 +120,6 @@ def _dense_mutual_reachability_graph(
                 )
                 distance_matrix[i, j] = mutual_reachibility_distance
 
-
 def _sparse_mutual_reachability_graph(
     cnp.ndarray[floating, ndim=1, mode="c"] data,
     cnp.ndarray[integral, ndim=1, mode="c"] indices,

From f34c121385f8b610fc7ed9a7bba7de5ededdac5e Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 30 Nov 2022 18:56:31 -0500
Subject: [PATCH 19/25] Updated sparse check

---
 sklearn/cluster/_hdbscan/hdbscan.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 74af44c9e88f5..7ec7ad56cd177 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -13,7 +13,7 @@
 from warnings import warn
 
 import numpy as np
-from scipy.sparse import csgraph, issparse
+from scipy.sparse import csgraph, issparse, isspmatrix_csr
 
 from ...base import BaseEstimator, ClusterMixin
 from ...metrics import pairwise_distances
@@ -137,7 +137,7 @@ def _hdbscan_brute(
     distance_matrix /= alpha
 
     max_distance = metric_params.get("max_distance", 0.0)
-    if issparse(distance_matrix) and distance_matrix.format != "csr":
+    if isspmatrix_csr(distance_matrix):
         # we need CSR format to avoid a conversion in `_brute_mst` when calling
         # `csgraph.connected_components`
         distance_matrix = distance_matrix.tocsr()

From cc526317dc29cd7c92ff4fecb9c97e2a0b747b75 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 19 Dec 2022 13:00:24 -0500
Subject: [PATCH 20/25] Removed prange and added note

---
 sklearn/cluster/_hdbscan/_reachability.pyx    |   5 +-
 .../_argkminlabels.pyx                        | 679 +++++++++++++
 .../_engines.pxd                              | 347 +++++++
 .../_engines.pyx                              | 940 ++++++++++++++++++
 4 files changed, 1969 insertions(+), 2 deletions(-)
 create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx
 create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pxd
 create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pyx

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index efc641df29e19..dc4263694f89a 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -10,7 +10,6 @@ cimport numpy as cnp
 import numpy as np
 from scipy.sparse import issparse
 from cython cimport floating, integral
-from cython.parallel cimport prange
 from libc.math cimport isfinite, INFINITY
 
 cnp.import_array()
@@ -111,7 +110,9 @@ def _dense_mutual_reachability_graph(
     )
 
     with nogil:
-        for i in prange(n_samples):
+        # TODO: Update w/ prange with thread count based on
+        # _openmp_effective_n_threads
+        for i in range(n_samples):
             for j in range(n_samples):
                 mutual_reachibility_distance = max(
                     core_distances[i],
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx
new file mode 100644
index 0000000000000..32e0a4d6d0546
--- /dev/null
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx
@@ -0,0 +1,679 @@
+
+from cython cimport floating, integral
+from cython.parallel cimport parallel, prange
+from libcpp.map cimport map as cmap, pair
+from libc.stdlib cimport free
+
+cimport numpy as cnp
+
+cnp.import_array()
+
+from ...utils._typedefs cimport ITYPE_t, DTYPE_t
+from ...utils._typedefs import ITYPE, DTYPE
+from ...utils._sorting cimport simultaneous_sort
+import numpy as np
+from scipy.sparse import issparse
+from sklearn.utils.fixes import threadpool_limits
+
+cpdef enum WeightingStrategy:
+    uniform = 0
+    distance = 1
+    other = 2
+from ._argkmin cimport ArgKmin64, EuclideanArgKmin64
+from ._datasets_pair cimport DatasetsPair64
+
+cdef class ArgKminLabels64(ArgKmin64):
+    """
+    64bit implementation of ArgKminLabels.
+    """
+    cdef:
+        const ITYPE_t[:] labels,
+        DTYPE_t[:, :] label_weights
+        cmap[ITYPE_t, ITYPE_t] labels_to_index
+        WeightingStrategy weight_type
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        ITYPE_t k,
+        weights,
+        labels,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+    ):
+        """Compute the argkmin reduction.
+
+        This classmethod is responsible for introspecting the arguments
+        values to dispatch to the most appropriate implementation of
+        :class:`ArgKminLabels64`.
+
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
+
+        No instance should directly be created outside of this class method.
+        """
+        if (
+            (
+                metric in ("euclidean", "sqeuclidean")
+                or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2)
+            )
+            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
+        ):
+            # Specialized implementation of ArgKminLabels for the Euclidean distance
+            # for the dense-dense and sparse-sparse cases.
+            # This implementation computes the distances by chunk using
+            # a decomposition of the Squared Euclidean distance.
+            # This specialisation has an improved arithmetic intensity for both
+            # the dense and sparse settings, allowing in most case speed-ups of
+            # several orders of magnitude compared to the generic ArgKmin
+            # implementation.
+            # For more information see MiddleTermComputer.
+            use_squared_distances = metric == "sqeuclidean"
+            pda = EuclideanArgKminLabels64(
+                X=X, Y=Y, k=k,
+                use_squared_distances=use_squared_distances,
+                chunk_size=chunk_size,
+                strategy=strategy,
+                metric_kwargs=metric_kwargs,
+                weights=weights,
+                labels=labels,
+            )
+        else:
+            # Fall back on a generic implementation that handles most scipy
+            # metrics by computing the distances between 2 vectors at a time.
+            pda = ArgKminLabels64(
+                datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs),
+                k=k,
+                chunk_size=chunk_size,
+                strategy=strategy,
+                weights=weights,
+                labels=labels,
+            )
+
+        # Limit the number of threads in second level of nested parallelism for BLAS
+        # to avoid threads over-subscription (in GEMM for instance).
+        with threadpool_limits(limits=1, user_api="blas"):
+            if pda.execute_in_parallel_on_Y:
+                pda._parallel_on_Y()
+            else:
+                pda._parallel_on_X()
+
+        return pda._finalize_results()
+
+    def __init__(
+        self,
+        DatasetsPair64 datasets_pair,
+        const ITYPE_t[:] labels,
+        chunk_size=None,
+        strategy=None,
+        ITYPE_t k=1,
+        weights=None,
+    ):
+        super().__init__(
+            datasets_pair=datasets_pair,
+            chunk_size=chunk_size,
+            strategy=strategy,
+            k=k,
+        )
+
+        if weights == "uniform":
+            self.weight_type = WeightingStrategy.uniform
+        elif weights == "distance":
+            self.weight_type = WeightingStrategy.distance
+        else:
+            self.weight_type = WeightingStrategy.other
+        self.labels = labels
+
+        cdef ITYPE_t[:] unique_labels = np.unique(labels)
+
+        cdef ITYPE_t idx, label
+        # Map from set of unique labels to their indices in `label_weights`
+        for idx, label in enumerate(unique_labels):
+            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
+
+        # Buffer used in building a histogram for one-pass weighted mode
+        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
+
+    def _finalize_results(self):
+        probabilities = np.asarray(self.label_weights)
+        probabilities /= probabilities.sum(axis=1, keepdims=True)
+        return probabilities
+
+    cdef inline void weighted_histogram_mode(
+        self,
+        ITYPE_t sample_index,
+        ITYPE_t* indices,
+        DTYPE_t* distances,
+   ) nogil:
+        cdef:
+            ITYPE_t y_idx, label, label_index, multi_output_index
+            DTYPE_t label_weight = 1
+
+        # Add each of the k nearest neighbours' weight to its label's bin.
+        for jdx in range(self.k):
+            # NOTE(review): a zero distance is not guarded against here.
+            if self.weight_type == WeightingStrategy.distance:
+                label_weight = 1 / distances[jdx]
+            # Absolute index of the jdx-th neighbour, in [0, n_samples_Y)
+            y_idx = indices[jdx]
+            label = self.labels[y_idx]
+            label_index = self.labels_to_index[label]
+            self.label_weights[sample_index][label_index] += label_weight
+        return
+
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, sample_index
+        # Sort each sample's heap of `X[X_start:X_end]` in ascending order
+        # w.r.t. the distances, then accumulate its label histogram.
+        for idx in range(X_end - X_start):
+            simultaneous_sort(
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.k
+            )
+            # Absolute index of the sample in [0, n_samples_X).
+            sample_index = X_start + idx
+            # The heap of the idx-th sample of the chunk starts at offset
+            # `idx * self.k`; offset 0 would reuse the first sample's heap.
+            self.weighted_histogram_mode(
+                sample_index,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+            )
+        return
+
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t sample_index, thread_num
+
+        with nogil, parallel(num_threads=self.chunks_n_threads):
+            # Free the per-thread temporary heap buffers.
+            for thread_num in prange(self.chunks_n_threads, schedule='static'):
+                free(self.heaps_r_distances_chunks[thread_num])
+                free(self.heaps_indices_chunks[thread_num])
+
+            # Sort the main heaps in ascending order w.r.t. the distances and
+            # build each sample's label histogram, sample-wise (no locks).
+            for sample_index in prange(self.n_samples_X, schedule='static'):
+                simultaneous_sort(
+                    &self.argkmin_distances[sample_index, 0],
+                    &self.argkmin_indices[sample_index, 0],
+                    self.k,
+                )
+                self.weighted_histogram_mode(
+                    sample_index,
+                    &self.argkmin_indices[sample_index][0],
+                    &self.argkmin_distances[sample_index][0],
+                )
+        return
+
+cdef class EuclideanArgKminLabels64(EuclideanArgKmin64):
+    """
+    64bit Euclidean-specialized implementation of ArgKminLabels.
+    """
+    cdef:
+        const ITYPE_t[:] labels,
+        DTYPE_t[:, :] label_weights
+        cmap[ITYPE_t, ITYPE_t] labels_to_index
+        WeightingStrategy weight_type
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t k,
+        bint use_squared_distances=False,
+        chunk_size=None,
+        strategy=None,
+        metric_kwargs=None,
+        weights=None,
+        labels=None,
+    ):
+        super().__init__(
+            X=X, Y=Y, k=k,
+            use_squared_distances=use_squared_distances,
+            chunk_size=chunk_size,
+            strategy=strategy,
+            metric_kwargs=metric_kwargs,
+        )
+        if weights == "uniform":
+            self.weight_type = WeightingStrategy.uniform
+        elif weights == "distance":
+            self.weight_type = WeightingStrategy.distance
+        else:
+            self.weight_type = WeightingStrategy.other
+        self.labels = labels
+
+        cdef ITYPE_t[:] unique_labels = np.unique(labels)
+
+        cdef ITYPE_t idx, label
+        # Map from set of unique labels to their indices in `label_weights`
+        for idx, label in enumerate(unique_labels):
+            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
+
+        # Buffer used in building a histogram for one-pass weighted mode
+        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
+
+    def _finalize_results(self):
+        probabilities = np.asarray(self.label_weights)
+        probabilities /= probabilities.sum(axis=1, keepdims=True)
+        return probabilities
+
+    cdef inline void weighted_histogram_mode(
+        self,
+        ITYPE_t sample_index,
+        ITYPE_t* indices,
+        DTYPE_t* distances,
+   ) nogil:
+        cdef:
+            ITYPE_t y_idx, label, label_index, multi_output_index
+            DTYPE_t label_weight = 1
+
+        # Add each of the k nearest neighbours' weight to its label's bin.
+        for jdx in range(self.k):
+            # NOTE(review): a zero distance is not guarded against here.
+            if self.weight_type == WeightingStrategy.distance:
+                label_weight = 1 / distances[jdx]
+            # Absolute index of the jdx-th neighbour, in [0, n_samples_Y)
+            y_idx = indices[jdx]
+            label = self.labels[y_idx]
+            label_index = self.labels_to_index[label]
+            self.label_weights[sample_index][label_index] += label_weight
+        return
+
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, sample_index
+        # Sort each sample's heap of `X[X_start:X_end]` in ascending order
+        # w.r.t. the distances, then accumulate its label histogram.
+        for idx in range(X_end - X_start):
+            simultaneous_sort(
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.k
+            )
+            # Absolute index of the sample in [0, n_samples_X).
+            sample_index = X_start + idx
+            # The heap of the idx-th sample of the chunk starts at offset
+            # `idx * self.k`; offset 0 would reuse the first sample's heap.
+            self.weighted_histogram_mode(
+                sample_index,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+            )
+        return
+
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t sample_index, thread_num
+
+        with nogil, parallel(num_threads=self.chunks_n_threads):
+            # Free the per-thread temporary heap buffers.
+            for thread_num in prange(self.chunks_n_threads, schedule='static'):
+                free(self.heaps_r_distances_chunks[thread_num])
+                free(self.heaps_indices_chunks[thread_num])
+
+            # Sort the main heaps in ascending order w.r.t. the distances and
+            # build each sample's label histogram, sample-wise (no locks).
+            for sample_index in prange(self.n_samples_X, schedule='static'):
+                simultaneous_sort(
+                    &self.argkmin_distances[sample_index, 0],
+                    &self.argkmin_indices[sample_index, 0],
+                    self.k,
+                )
+                self.weighted_histogram_mode(
+                    sample_index,
+                    &self.argkmin_indices[sample_index][0],
+                    &self.argkmin_distances[sample_index][0],
+                )
+        return
+from ._argkmin cimport ArgKmin32, EuclideanArgKmin32
+from ._datasets_pair cimport DatasetsPair32
+
+cdef class ArgKminLabels32(ArgKmin32):
+    """
+    32bit implementation of ArgKminLabels.
+    """
+    cdef:
+        const ITYPE_t[:] labels,
+        DTYPE_t[:, :] label_weights
+        cmap[ITYPE_t, ITYPE_t] labels_to_index
+        WeightingStrategy weight_type
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        ITYPE_t k,
+        weights,
+        labels,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        str strategy=None,
+    ):
+        """Compute the argkmin reduction.
+
+        This classmethod is responsible for introspecting the arguments
+        values to dispatch to the most appropriate implementation of
+        :class:`ArgKminLabels32`.
+
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
+
+        No instance should directly be created outside of this class method.
+        """
+        if (
+            (
+                metric in ("euclidean", "sqeuclidean")
+                or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2)
+            )
+            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
+        ):
+            # Specialized implementation of ArgKminLabels for the Euclidean distance
+            # for the dense-dense and sparse-sparse cases.
+            # This implementation computes the distances by chunk using
+            # a decomposition of the Squared Euclidean distance.
+            # This specialisation has an improved arithmetic intensity for both
+            # the dense and sparse settings, allowing in most case speed-ups of
+            # several orders of magnitude compared to the generic ArgKmin
+            # implementation.
+            # For more information see MiddleTermComputer.
+            use_squared_distances = metric == "sqeuclidean"
+            pda = EuclideanArgKminLabels32(
+                X=X, Y=Y, k=k,
+                use_squared_distances=use_squared_distances,
+                chunk_size=chunk_size,
+                strategy=strategy,
+                metric_kwargs=metric_kwargs,
+                weights=weights,
+                labels=labels,
+            )
+        else:
+            # Fall back on a generic implementation that handles most scipy
+            # metrics by computing the distances between 2 vectors at a time.
+            pda = ArgKminLabels32(
+                datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs),
+                k=k,
+                chunk_size=chunk_size,
+                strategy=strategy,
+                weights=weights,
+                labels=labels,
+            )
+
+        # Limit the number of threads in second level of nested parallelism for BLAS
+        # to avoid threads over-subscription (in GEMM for instance).
+        with threadpool_limits(limits=1, user_api="blas"):
+            if pda.execute_in_parallel_on_Y:
+                pda._parallel_on_Y()
+            else:
+                pda._parallel_on_X()
+
+        return pda._finalize_results()
+
+    def __init__(
+        self,
+        DatasetsPair32 datasets_pair,
+        const ITYPE_t[:] labels,
+        chunk_size=None,
+        strategy=None,
+        ITYPE_t k=1,
+        weights=None,
+    ):
+        super().__init__(
+            datasets_pair=datasets_pair,
+            chunk_size=chunk_size,
+            strategy=strategy,
+            k=k,
+        )
+
+        if weights == "uniform":
+            self.weight_type = WeightingStrategy.uniform
+        elif weights == "distance":
+            self.weight_type = WeightingStrategy.distance
+        else:
+            self.weight_type = WeightingStrategy.other
+        self.labels = labels
+
+        cdef ITYPE_t[:] unique_labels = np.unique(labels)
+
+        cdef ITYPE_t idx, label
+        # Map from set of unique labels to their indices in `label_weights`
+        for idx, label in enumerate(unique_labels):
+            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
+
+        # Buffer used in building a histogram for one-pass weighted mode
+        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
+
+    def _finalize_results(self):
+        probabilities = np.asarray(self.label_weights)
+        probabilities /= probabilities.sum(axis=1, keepdims=True)
+        return probabilities
+
+    cdef inline void weighted_histogram_mode(
+        self,
+        ITYPE_t sample_index,
+        ITYPE_t* indices,
+        DTYPE_t* distances,
+   ) nogil:
+        cdef:
+            ITYPE_t y_idx, label, label_index, multi_output_index
+            DTYPE_t label_weight = 1
+
+        # Add each of the k nearest neighbours' weight to its label's bin.
+        for jdx in range(self.k):
+            # NOTE(review): a zero distance is not guarded against here.
+            if self.weight_type == WeightingStrategy.distance:
+                label_weight = 1 / distances[jdx]
+            # Absolute index of the jdx-th neighbour, in [0, n_samples_Y)
+            y_idx = indices[jdx]
+            label = self.labels[y_idx]
+            label_index = self.labels_to_index[label]
+            self.label_weights[sample_index][label_index] += label_weight
+        return
+
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, sample_index
+        # Sort each sample's heap of `X[X_start:X_end]` in ascending order
+        # w.r.t. the distances, then accumulate its label histogram.
+        for idx in range(X_end - X_start):
+            simultaneous_sort(
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.k
+            )
+            # Absolute index of the sample in [0, n_samples_X).
+            sample_index = X_start + idx
+            # The heap of the idx-th sample of the chunk starts at offset
+            # `idx * self.k`; offset 0 would reuse the first sample's heap.
+            self.weighted_histogram_mode(
+                sample_index,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+            )
+        return
+
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t sample_index, thread_num
+
+        with nogil, parallel(num_threads=self.chunks_n_threads):
+            # Free the per-thread temporary heap buffers.
+            for thread_num in prange(self.chunks_n_threads, schedule='static'):
+                free(self.heaps_r_distances_chunks[thread_num])
+                free(self.heaps_indices_chunks[thread_num])
+
+            # Sort the main heaps in ascending order w.r.t. the distances and
+            # build each sample's label histogram, sample-wise (no locks).
+            for sample_index in prange(self.n_samples_X, schedule='static'):
+                simultaneous_sort(
+                    &self.argkmin_distances[sample_index, 0],
+                    &self.argkmin_indices[sample_index, 0],
+                    self.k,
+                )
+                self.weighted_histogram_mode(
+                    sample_index,
+                    &self.argkmin_indices[sample_index][0],
+                    &self.argkmin_distances[sample_index][0],
+                )
+        return
+
+cdef class EuclideanArgKminLabels32(EuclideanArgKmin32):
+    """
+    32bit Euclidean-specialized implementation of ArgKminLabels.
+    """
+    cdef:
+        const ITYPE_t[:] labels,
+        DTYPE_t[:, :] label_weights
+        cmap[ITYPE_t, ITYPE_t] labels_to_index
+        WeightingStrategy weight_type
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t k,
+        bint use_squared_distances=False,
+        chunk_size=None,
+        strategy=None,
+        metric_kwargs=None,
+        weights=None,
+        labels=None,
+    ):
+        super().__init__(
+            X=X, Y=Y, k=k,
+            use_squared_distances=use_squared_distances,
+            chunk_size=chunk_size,
+            strategy=strategy,
+            metric_kwargs=metric_kwargs,
+        )
+        if weights == "uniform":
+            self.weight_type = WeightingStrategy.uniform
+        elif weights == "distance":
+            self.weight_type = WeightingStrategy.distance
+        else:
+            self.weight_type = WeightingStrategy.other
+        self.labels = labels
+
+        cdef ITYPE_t[:] unique_labels = np.unique(labels)
+
+        cdef ITYPE_t idx, label
+        # Map from set of unique labels to their indices in `label_weights`
+        for idx, label in enumerate(unique_labels):
+            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
+
+        # Buffer used in building a histogram for one-pass weighted mode
+        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
+
+    def _finalize_results(self):
+        probabilities = np.asarray(self.label_weights)
+        probabilities /= probabilities.sum(axis=1, keepdims=True)
+        return probabilities
+
+    cdef inline void weighted_histogram_mode(
+        self,
+        ITYPE_t sample_index,
+        ITYPE_t* indices,
+        DTYPE_t* distances,
+   ) nogil:
+        cdef:
+            ITYPE_t y_idx, label, label_index, multi_output_index
+            DTYPE_t label_weight = 1
+
+        # Add each of the k nearest neighbours' weight to its label's bin.
+        for jdx in range(self.k):
+            # NOTE(review): a zero distance is not guarded against here.
+            if self.weight_type == WeightingStrategy.distance:
+                label_weight = 1 / distances[jdx]
+            # Absolute index of the jdx-th neighbour, in [0, n_samples_Y)
+            y_idx = indices[jdx]
+            label = self.labels[y_idx]
+            label_index = self.labels_to_index[label]
+            self.label_weights[sample_index][label_index] += label_weight
+        return
+
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, sample_index
+        # Sort each sample's heap of `X[X_start:X_end]` in ascending order
+        # w.r.t. the distances, then accumulate its label histogram.
+        for idx in range(X_end - X_start):
+            simultaneous_sort(
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.k
+            )
+            # Absolute index of the sample in [0, n_samples_X).
+            sample_index = X_start + idx
+            # The heap of the idx-th sample of the chunk starts at offset
+            # `idx * self.k`; offset 0 would reuse the first sample's heap.
+            self.weighted_histogram_mode(
+                sample_index,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+            )
+        return
+
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t sample_index, thread_num
+
+        with nogil, parallel(num_threads=self.chunks_n_threads):
+            # Free the per-thread temporary heap buffers.
+            for thread_num in prange(self.chunks_n_threads, schedule='static'):
+                free(self.heaps_r_distances_chunks[thread_num])
+                free(self.heaps_indices_chunks[thread_num])
+
+            # Sort the main heaps in ascending order w.r.t. the distances and
+            # build each sample's label histogram, sample-wise (no locks).
+            for sample_index in prange(self.n_samples_X, schedule='static'):
+                simultaneous_sort(
+                    &self.argkmin_distances[sample_index, 0],
+                    &self.argkmin_indices[sample_index, 0],
+                    self.k,
+                )
+                self.weighted_histogram_mode(
+                    sample_index,
+                    &self.argkmin_indices[sample_index][0],
+                    &self.argkmin_distances[sample_index][0],
+                )
+        return
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd b/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd
new file mode 100644
index 0000000000000..33023cdb2a400
--- /dev/null
+++ b/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd
@@ -0,0 +1,347 @@
+cimport numpy as cnp
+
+from libcpp.vector cimport vector
+
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil
+
+cdef class BaseEngine:
+
+    cdef void _parallel_on_X_parallel_init(
+        self,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_X_parallel_finalize(
+        self,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_init(
+        self,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_Y_synchronize(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+cdef class EuclideanEngine64(BaseEngine):
+    cdef:
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
+        DTYPE_t[::1] X_norm_squared, Y_norm_squared
+
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
+
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_init(self) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef DTYPE_t _compute_pair_distance(
+        self,
+        ITYPE_t i, # Index of X sample
+        ITYPE_t j, # Index of Y sample
+        ITYPE_t X_start, # Index offset
+        ITYPE_t Y_start, # Index offset
+        DTYPE_t * dist_middle_terms, # Array of pre-computed middle terms
+    ) nogil
+
+cdef class DenseDenseEuclideanEngine64(EuclideanEngine64):
+    cdef:
+        const DTYPE_t[:, ::1] X
+        const DTYPE_t[:, ::1] Y
+
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef class SparseSparseEuclideanEngine64(EuclideanEngine64):
+    cdef:
+        const DTYPE_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const DTYPE_t[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+cdef class EuclideanEngine32(BaseEngine):
+    cdef:
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
+        DTYPE_t[::1] X_norm_squared, Y_norm_squared
+
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
+
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_init(self) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef DTYPE_t _compute_pair_distance(
+        self,
+        ITYPE_t i, # Index of X sample
+        ITYPE_t j, # Index of Y sample
+        ITYPE_t X_start, # Index offset
+        ITYPE_t Y_start, # Index offset
+        DTYPE_t * dist_middle_terms, # Array of pre-computed middle terms
+    ) nogil
+
+cdef class DenseDenseEuclideanEngine32(EuclideanEngine32):
+    cdef:
+        const cnp.float32_t[:, ::1] X
+        const cnp.float32_t[:, ::1] Y
+
+        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
+        vector[vector[DTYPE_t]] X_c_upcast
+        vector[vector[DTYPE_t]] Y_c_upcast
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
+cdef class SparseSparseEuclideanEngine32(EuclideanEngine32):
+    cdef:
+        const DTYPE_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const DTYPE_t[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx b/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx
new file mode 100644
index 0000000000000..5e1fe8cb457b3
--- /dev/null
+++ b/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx
@@ -0,0 +1,940 @@
+cimport numpy as cnp
+
+from libcpp.vector cimport vector
+
+from ...utils._cython_blas cimport (
+  BLAS_Order,
+  BLAS_Trans,
+  NoTrans,
+  RowMajor,
+  Trans,
+  _gemm,
+)
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
+# Introduction in Cython:
+#
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
+cdef extern from "<algorithm>" namespace "std" nogil:
+    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
+
+import numpy as np
+from scipy.sparse import issparse, csr_matrix
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
+from ...utils import check_array
+
+cdef class BaseEngine:
+    def __init__(self):
+        return
+
+    cdef void _parallel_on_X_parallel_init(
+        self,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_parallel_finalize(
+        self,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(
+        self,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_synchronize(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+# TODO: If possible optimize this routine to efficiently treat cases where
+# `n_samples_X << n_samples_Y` met in practise when X_test consists of a
+# few samples, and thus when there's a single chunk of X whose number of
+# samples is less that the default chunk size.
+
+# TODO: compare this routine with the similar ones in SciPy, especially
+# `csr_matmat` which might implement a better algorithm.
+# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669  # noqa
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil:
+    # This routine assumes that D points to the first element of a
+    # zeroed buffer of length at least equal to n_X × n_Y, conceptually
+    # representing a 2-d C-ordered array.
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
+
+    for i in range(n_X):
+        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+            X_i_col_idx = X_indices[X_i_ptr]
+            for j in range(n_Y):
+                k = i * n_Y + j
+                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
+                    Y_j_col_idx = Y_indices[Y_j_ptr]
+                    if X_i_col_idx == Y_j_col_idx:
+                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
+
+
+from ._base cimport _sqeuclidean_row_norms64
+
+cdef class EuclideanEngine64(BaseEngine):
+    """Helper class to compute a Euclidean distance matrix in chunks.
+
+    This is an abstract base class that is further specialized depending
+    on the type of data (dense or sparse).
+
+    `EuclideanDistance` subclasses relies on the squared Euclidean
+    distances between chunks of vectors X_c and Y_c using the
+    following decomposition for the (i,j) pair :
+
+
+         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
+    """
+
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        pda,
+    ) -> EuclideanEngine64:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+
+        Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+
+        Returns
+        -------
+        engine: EuclideanEngine64
+            The suited EuclideanEngine64 implementation.
+        """
+        X_is_sparse = issparse(X)
+        Y_is_sparse = issparse(Y)
+        dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk
+        if not X_is_sparse and not Y_is_sparse:
+            return DenseDenseEuclideanEngine64(
+                X,
+                Y,
+                effective_n_threads=pda.effective_n_threads,
+                chunks_n_threads=pda.chunks_n_threads,
+                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
+                chunk_size=pda.chunk_size,
+                metric_kwargs=pda.metric_kwargs,
+            )
+        if X_is_sparse and Y_is_sparse:
+            return SparseSparseEuclideanEngine64(
+                X,
+                Y,
+                effective_n_threads=pda.effective_n_threads,
+                chunks_n_threads=pda.chunks_n_threads,
+                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
+                chunk_size=pda.chunk_size,
+                metric_kwargs=pda.metric_kwargs,
+            )
+
+        raise NotImplementedError(
+            "X and Y must be both CSR sparse matrices or both numpy arrays."
+        )
+
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t chunk_size,
+        dict metric_kwargs=None,
+    ):
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = X.shape[1]
+        self.chunk_size = chunk_size
+
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
+            self.Y_norm_squared = check_array(
+                metric_kwargs.pop("Y_norm_squared"),
+                ensure_2d=False,
+                input_name="Y_norm_squared",
+                dtype=np.float64,
+            )
+        else:
+            self.Y_norm_squared = _sqeuclidean_row_norms64(
+                Y,
+                self.effective_n_threads,
+            )
+
+        if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
+            self.X_norm_squared = check_array(
+                metric_kwargs.pop("X_norm_squared"),
+                ensure_2d=False,
+                input_name="X_norm_squared",
+                dtype=np.float64,
+            )
+        else:
+            # Do not recompute norms if datasets are identical.
+            self.X_norm_squared = (
+                self.Y_norm_squared if X is Y else
+                _sqeuclidean_row_norms64(
+                    X,
+                    self.effective_n_threads,
+                )
+            )
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return NULL
+
+    cdef DTYPE_t _compute_pair_distance(
+        self,
+        ITYPE_t i, # Index of X sample
+        ITYPE_t j, # Index of Y sample
+        ITYPE_t X_start, # Index offset
+        ITYPE_t Y_start, # Index offset
+        DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms
+    ) nogil:
+
+        cdef ITYPE_t n_Y = len(self.Y_norm_squared)
+        # Index of middle term
+        cdef ITYPE_t k = n_Y * i + j
+        cdef DTYPE_t val = (
+            self.X_norm_squared[i + X_start] +
+            dist_middle_terms[i * n_Y + j] +
+            self.Y_norm_squared[j + Y_start]
+        )
+        # Catastrophic cancellation might cause -0. to be present,
+        # e.g. when computing d(x_i, y_i) when X is Y.
+        return max(0., val)
+
+
+cdef class DenseDenseEuclideanEngine64(EuclideanEngine64):
+    """Computes the middle term of the Euclidean distance between two chunked dense matrices
+    X_c and Y_c.
+
+                        dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    This class use the BLAS gemm routine to perform the dot product of each chunks
+    of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
+    """
+
+    def __init__(
+        self,
+        const DTYPE_t[:, ::1] X,
+        const DTYPE_t[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+        dict metric_kwargs=None,
+    ):
+        super().__init__(
+            X, Y,
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+            metric_kwargs=None,
+        )
+        self.X = X
+        self.Y = Y
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentations, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_end - X_start
+            ITYPE_t n = Y_end - Y_start
+            ITYPE_t K = self.n_features
+            DTYPE_t alpha = - 2.
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            # See: https://github.com/scipy/scipy/issues/14262
+            DTYPE_t * A = <DTYPE_t *> &self.X[X_start, 0]
+            DTYPE_t * B = <DTYPE_t *> &self.Y[Y_start, 0]
+            ITYPE_t lda = self.n_features
+            ITYPE_t ldb = self.n_features
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_end - Y_start
+
+        # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+
+cdef class SparseSparseEuclideanEngine64(EuclideanEngine64):
+    """Middle term of the Euclidean distance between two chunked CSR matrices.
+
+    The result is return as a contiguous array.
+
+            dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
+    This routine iterates over the data, indices and indptr arrays of the sparse matrices without
+    densifying them.
+    """
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            X, Y,
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+            metric_kwargs=None,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        _middle_term_sparse_sparse_64(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y_data,
+            self.Y_indices,
+            self.Y_indptr,
+            Y_start,
+            Y_end,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
+
+from ._base cimport _sqeuclidean_row_norms32
+
+cdef class EuclideanEngine32(BaseEngine):
+    """Helper class to compute a Euclidean distance matrix in chunks.
+
+    This is an abstract base class that is further specialized depending
+    on the type of data (dense or sparse).
+
+    `EuclideanDistance` subclasses relies on the squared Euclidean
+    distances between chunks of vectors X_c and Y_c using the
+    following decomposition for the (i,j) pair :
+
+
+         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
+    """
+
+    @classmethod
+    def get_for(
+        cls,
+        X,
+        Y,
+        pda,
+    ) -> EuclideanEngine32:
+        """Return the DatasetsPair implementation for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+
+        Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
+            Input data.
+            If provided as a ndarray, it must be C-contiguous.
+
+        Returns
+        -------
+        engine: EuclideanEngine32
+            The suited EuclideanEngine32 implementation.
+        """
+        X_is_sparse = issparse(X)
+        Y_is_sparse = issparse(Y)
+        dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk
+        if not X_is_sparse and not Y_is_sparse:
+            return DenseDenseEuclideanEngine32(
+                X,
+                Y,
+                effective_n_threads=pda.effective_n_threads,
+                chunks_n_threads=pda.chunks_n_threads,
+                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
+                chunk_size=pda.chunk_size,
+                metric_kwargs=pda.metric_kwargs,
+            )
+        if X_is_sparse and Y_is_sparse:
+            return SparseSparseEuclideanEngine32(
+                X,
+                Y,
+                effective_n_threads=pda.effective_n_threads,
+                chunks_n_threads=pda.chunks_n_threads,
+                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
+                chunk_size=pda.chunk_size,
+                metric_kwargs=pda.metric_kwargs,
+            )
+
+        raise NotImplementedError(
+            "X and Y must be both CSR sparse matrices or both numpy arrays."
+        )
+
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t chunk_size,
+        dict metric_kwargs=None,
+    ):
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = X.shape[1]
+        self.chunk_size = chunk_size
+
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
+            self.Y_norm_squared = check_array(
+                metric_kwargs.pop("Y_norm_squared"),
+                ensure_2d=False,
+                input_name="Y_norm_squared",
+                dtype=np.float64,
+            )
+        else:
+            self.Y_norm_squared = _sqeuclidean_row_norms32(
+                Y,
+                self.effective_n_threads,
+            )
+
+        if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
+            self.X_norm_squared = check_array(
+                metric_kwargs.pop("X_norm_squared"),
+                ensure_2d=False,
+                input_name="X_norm_squared",
+                dtype=np.float64,
+            )
+        else:
+            # Do not recompute norms if datasets are identical.
+            self.X_norm_squared = (
+                self.Y_norm_squared if X is Y else
+                _sqeuclidean_row_norms32(
+                    X,
+                    self.effective_n_threads,
+                )
+            )
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return NULL
+
+    cdef DTYPE_t _compute_pair_distance(
+        self,
+        ITYPE_t i, # Index of X sample
+        ITYPE_t j, # Index of Y sample
+        ITYPE_t X_start, # Index offset
+        ITYPE_t Y_start, # Index offset
+        DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms
+    ) nogil:
+
+        cdef ITYPE_t n_Y = len(self.Y_norm_squared)
+        # Index of middle term
+        cdef ITYPE_t k = n_Y * i + j
+        cdef DTYPE_t val = (
+            self.X_norm_squared[i + X_start] +
+            dist_middle_terms[i * n_Y + j] +
+            self.Y_norm_squared[j + Y_start]
+        )
+        # Catastrophic cancellation might cause -0. to be present,
+        # e.g. when computing d(x_i, y_i) when X is Y.
+        return max(0., val)
+
+
+cdef class DenseDenseEuclideanEngine32(EuclideanEngine32):
+    """Computes the middle term of the Euclidean distance between two chunked dense matrices
+    X_c and Y_c.
+
+                        dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    This class use the BLAS gemm routine to perform the dot product of each chunks
+    of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
+    """
+
+    def __init__(
+        self,
+        const cnp.float32_t[:, ::1] X,
+        const cnp.float32_t[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+        dict metric_kwargs=None,
+    ):
+        super().__init__(
+            X, Y,
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+            metric_kwargs=None,
+        )
+        self.X = X
+        self.Y = Y
+        # We populate the buffer for upcasting chunks of X and Y from float32 to float64.
+        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+        upcast_buffer_n_elements = self.chunk_size * n_features
+
+        for thread_num in range(self.effective_n_threads):
+            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = X_end - X_start
+
+        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t n_chunk_samples = Y_end - Y_start
+
+        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
+        for i in range(n_chunk_samples):
+            for j in range(self.n_features):
+                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentations, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_end - X_start
+            ITYPE_t n = Y_end - Y_start
+            ITYPE_t K = self.n_features
+            DTYPE_t alpha = - 2.
+            DTYPE_t * A = self.X_c_upcast[thread_num].data()
+            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
+            ITYPE_t lda = self.n_features
+            ITYPE_t ldb = self.n_features
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_end - Y_start
+
+        # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+
+cdef class SparseSparseEuclideanEngine32(EuclideanEngine32):
+    """Middle term of the Euclidean distance between two chunked CSR matrices.
+
+    The result is return as a contiguous array.
+
+            dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
+    This routine iterates over the data, indices and indptr arrays of the sparse matrices without
+    densifying them.
+    """
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            X, Y,
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+            metric_kwargs=None,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        _middle_term_sparse_sparse_64(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y_data,
+            self.Y_indices,
+            self.Y_indptr,
+            Y_start,
+            Y_end,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms

From 7eff5ba01e72b209cd3bfb0b2c14b28c9be778cb Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 19 Dec 2022 13:06:13 -0500
Subject: [PATCH 21/25] Revert "Removed prange and added note"

This reverts commit cc526317dc29cd7c92ff4fecb9c97e2a0b747b75.
---
 sklearn/cluster/_hdbscan/_reachability.pyx    |   5 +-
 .../_argkminlabels.pyx                        | 679 -------------
 .../_engines.pxd                              | 347 -------
 .../_engines.pyx                              | 940 ------------------
 4 files changed, 2 insertions(+), 1969 deletions(-)
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pxd
 delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pyx

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index dc4263694f89a..efc641df29e19 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -10,6 +10,7 @@ cimport numpy as cnp
 import numpy as np
 from scipy.sparse import issparse
 from cython cimport floating, integral
+from cython.parallel cimport prange
 from libc.math cimport isfinite, INFINITY
 
 cnp.import_array()
@@ -110,9 +111,7 @@ def _dense_mutual_reachability_graph(
     )
 
     with nogil:
-        # TODO: Update w/ prange with thread count based on
-        # _openmp_effective_n_threads
-        for i in range(n_samples):
+        for i in prange(n_samples):
             for j in range(n_samples):
                 mutual_reachibility_distance = max(
                     core_distances[i],
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx
deleted file mode 100644
index 32e0a4d6d0546..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx
+++ /dev/null
@@ -1,679 +0,0 @@
-
-from cython cimport floating, integral
-from cython.parallel cimport parallel, prange
-from libcpp.map cimport map as cmap, pair
-from libc.stdlib cimport free
-
-cimport numpy as cnp
-
-cnp.import_array()
-
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
-from ...utils._typedefs import ITYPE, DTYPE
-from ...utils._sorting cimport simultaneous_sort
-import numpy as np
-from scipy.sparse import issparse
-from sklearn.utils.fixes import threadpool_limits
-
-cpdef enum WeightingStrategy:
-    uniform = 0
-    distance = 1
-    other = 2
-from ._argkmin cimport ArgKmin64, EuclideanArgKmin64
-from ._datasets_pair cimport DatasetsPair64
-
-cdef class ArgKminLabels64(ArgKmin64):
-    """
-    64bit implementation of ArgKminLabel.
-    """
-    cdef:
-        const ITYPE_t[:] labels,
-        DTYPE_t[:, :] label_weights
-        cmap[ITYPE_t, ITYPE_t] labels_to_index
-        WeightingStrategy weight_type
-
-    @classmethod
-    def compute(
-        cls,
-        X,
-        Y,
-        ITYPE_t k,
-        weights,
-        labels,
-        str metric="euclidean",
-        chunk_size=None,
-        dict metric_kwargs=None,
-        str strategy=None,
-    ):
-        """Compute the argkmin reduction.
-
-        This classmethod is responsible for introspecting the arguments
-        values to dispatch to the most appropriate implementation of
-        :class:`ArgKmin64`.
-
-        This allows decoupling the API entirely from the implementation details
-        whilst maintaining RAII: all temporarily allocated datastructures necessary
-        for the concrete implementation are therefore freed when this classmethod
-        returns.
-
-        No instance should directly be created outside of this class method.
-        """
-        if (
-            (
-                metric in ("euclidean", "sqeuclidean")
-                or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2)
-            )
-            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
-        ):
-            # Specialized implementation of ArgKminLabels for the Euclidean distance
-            # for the dense-dense and sparse-sparse cases.
-            # This implementation computes the distances by chunk using
-            # a decomposition of the Squared Euclidean distance.
-            # This specialisation has an improved arithmetic intensity for both
-            # the dense and sparse settings, allowing in most case speed-ups of
-            # several orders of magnitude compared to the generic ArgKmin
-            # implementation.
-            # For more information see MiddleTermComputer.
-            use_squared_distances = metric == "sqeuclidean"
-            pda = EuclideanArgKminLabels64(
-                X=X, Y=Y, k=k,
-                use_squared_distances=use_squared_distances,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                metric_kwargs=metric_kwargs,
-                weights=weights,
-                labels=labels,
-            )
-        else:
-            # Fall back on a generic implementation that handles most scipy
-            # metrics by computing the distances between 2 vectors at a time.
-            pda = ArgKminLabels64(
-                datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs),
-                k=k,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                weights=weights,
-                labels=labels,
-            )
-
-        # Limit the number of threads in second level of nested parallelism for BLAS
-        # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
-            if pda.execute_in_parallel_on_Y:
-                pda._parallel_on_Y()
-            else:
-                pda._parallel_on_X()
-
-        return pda._finalize_results()
-
-    def __init__(
-        self,
-        DatasetsPair64 datasets_pair,
-        const ITYPE_t[:] labels,
-        chunk_size=None,
-        strategy=None,
-        ITYPE_t k=1,
-        weights=None,
-    ):
-        super().__init__(
-            datasets_pair=datasets_pair,
-            chunk_size=chunk_size,
-            strategy=strategy,
-            k=k,
-        )
-
-        if weights == "uniform":
-            self.weight_type = WeightingStrategy.uniform
-        elif weights == "distance":
-            self.weight_type = WeightingStrategy.distance
-        else:
-            self.weight_type = WeightingStrategy.other
-        self.labels = labels
-
-        cdef ITYPE_t[:] unique_labels = np.unique(labels)
-
-        cdef ITYPE_t idx, label
-        # Map from set of unique labels to their indices in `label_weights`
-        for idx, label in enumerate(unique_labels):
-            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
-
-        # Buffer used in building a histogram for one-pass weighted mode
-        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
-
-    def _finalize_results(self):
-        probabilities = np.asarray(self.label_weights)
-        probabilities /= probabilities.sum(axis=1, keepdims=True)
-        return probabilities
-
-    cdef inline void weighted_histogram_mode(
-        self,
-        ITYPE_t sample_index,
-        ITYPE_t* indices,
-        DTYPE_t* distances,
-   ) nogil:
-        cdef:
-            ITYPE_t y_idx, label, label_index, multi_output_index
-            DTYPE_t label_weight = 1
-
-        # Iterate through the sample k-nearest neighbours
-        for jdx in range(self.k):
-            # Absolute indice of the jdx-th Nearest Neighbors
-            # in range [0, n_samples_Y)
-            if self.weight_type == WeightingStrategy.distance:
-                label_weight = 1 / distances[jdx]
-            y_idx = indices[jdx]
-            label = self.labels[y_idx]
-            label_index = self.labels_to_index[label]
-            self.label_weights[sample_index][label_index] += label_weight
-        return
-
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, sample_index
-        # Sorting the main heaps portion associated to `X[X_start:X_end]`
-        # in ascending order w.r.t the distances.
-        for idx in range(X_end - X_start):
-            simultaneous_sort(
-                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
-                self.heaps_indices_chunks[thread_num] + idx * self.k,
-                self.k
-            )
-            # One-pass top-one weighted mode
-            # Compute the absolute index in [0, n_samples_X)
-            sample_index = X_start + idx
-            max_label_weight = -1
-            self.weighted_histogram_mode(
-                sample_index,
-                &self.heaps_indices_chunks[thread_num][0],
-                &self.heaps_r_distances_chunks[thread_num][0],
-            )
-        return
-
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t sample_index, thread_num
-
-        with nogil, parallel(num_threads=self.chunks_n_threads):
-            # Deallocating temporary datastructures
-            for thread_num in prange(self.chunks_n_threads, schedule='static'):
-                free(self.heaps_r_distances_chunks[thread_num])
-                free(self.heaps_indices_chunks[thread_num])
-
-            # Sorting the main in ascending order w.r.t the distances.
-            # This is done in parallel sample-wise (no need for locks).
-            for sample_index in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
-                    &self.argkmin_distances[sample_index, 0],
-                    &self.argkmin_indices[sample_index, 0],
-                    self.k,
-                )
-                self.weighted_histogram_mode(
-                    sample_index,
-                    &self.argkmin_indices[sample_index][0],
-                    &self.argkmin_distances[sample_index][0],
-                )
-        return
-
-cdef class EuclideanArgKminLabels64(EuclideanArgKmin64):
-    """
-    64bit implementation of ArgKminLabel.
-    """
-    cdef:
-        const ITYPE_t[:] labels,
-        DTYPE_t[:, :] label_weights
-        cmap[ITYPE_t, ITYPE_t] labels_to_index
-        WeightingStrategy weight_type
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t k,
-        bint use_squared_distances=False,
-        chunk_size=None,
-        strategy=None,
-        metric_kwargs=None,
-        weights=None,
-        labels=None,
-    ):
-        super().__init__(
-            X=X, Y=Y, k=k,
-            use_squared_distances=use_squared_distances,
-            chunk_size=chunk_size,
-            strategy=strategy,
-            metric_kwargs=metric_kwargs,
-        )
-        if weights == "uniform":
-            self.weight_type = WeightingStrategy.uniform
-        elif weights == "distance":
-            self.weight_type = WeightingStrategy.distance
-        else:
-            self.weight_type = WeightingStrategy.other
-        self.labels = labels
-
-        cdef ITYPE_t[:] unique_labels = np.unique(labels)
-
-        cdef ITYPE_t idx, label
-        # Map from set of unique labels to their indices in `label_weights`
-        for idx, label in enumerate(unique_labels):
-            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
-
-        # Buffer used in building a histogram for one-pass weighted mode
-        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
-
-    def _finalize_results(self):
-        probabilities = np.asarray(self.label_weights)
-        probabilities /= probabilities.sum(axis=1, keepdims=True)
-        return probabilities
-
-    cdef inline void weighted_histogram_mode(
-        self,
-        ITYPE_t sample_index,
-        ITYPE_t* indices,
-        DTYPE_t* distances,
-   ) nogil:
-        cdef:
-            ITYPE_t y_idx, label, label_index, multi_output_index
-            DTYPE_t label_weight = 1
-
-        # Iterate through the sample k-nearest neighbours
-        for jdx in range(self.k):
-            # Absolute indice of the jdx-th Nearest Neighbors
-            # in range [0, n_samples_Y)
-            if self.weight_type == WeightingStrategy.distance:
-                label_weight = 1 / distances[jdx]
-            y_idx = indices[jdx]
-            label = self.labels[y_idx]
-            label_index = self.labels_to_index[label]
-            self.label_weights[sample_index][label_index] += label_weight
-        return
-
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, sample_index
-        # Sorting the main heaps portion associated to `X[X_start:X_end]`
-        # in ascending order w.r.t the distances.
-        for idx in range(X_end - X_start):
-            simultaneous_sort(
-                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
-                self.heaps_indices_chunks[thread_num] + idx * self.k,
-                self.k
-            )
-            # One-pass top-one weighted mode
-            # Compute the absolute index in [0, n_samples_X)
-            sample_index = X_start + idx
-            max_label_weight = -1
-            self.weighted_histogram_mode(
-                sample_index,
-                &self.heaps_indices_chunks[thread_num][0],
-                &self.heaps_r_distances_chunks[thread_num][0],
-            )
-        return
-
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t sample_index, thread_num
-
-        with nogil, parallel(num_threads=self.chunks_n_threads):
-            # Deallocating temporary datastructures
-            for thread_num in prange(self.chunks_n_threads, schedule='static'):
-                free(self.heaps_r_distances_chunks[thread_num])
-                free(self.heaps_indices_chunks[thread_num])
-
-            # Sorting the main in ascending order w.r.t the distances.
-            # This is done in parallel sample-wise (no need for locks).
-            for sample_index in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
-                    &self.argkmin_distances[sample_index, 0],
-                    &self.argkmin_indices[sample_index, 0],
-                    self.k,
-                )
-                self.weighted_histogram_mode(
-                    sample_index,
-                    &self.argkmin_indices[sample_index][0],
-                    &self.argkmin_distances[sample_index][0],
-                )
-        return
-from ._argkmin cimport ArgKmin32, EuclideanArgKmin32
-from ._datasets_pair cimport DatasetsPair32
-
-cdef class ArgKminLabels32(ArgKmin32):
-    """
-    32bit implementation of ArgKminLabel.
-    """
-    cdef:
-        const ITYPE_t[:] labels,
-        DTYPE_t[:, :] label_weights
-        cmap[ITYPE_t, ITYPE_t] labels_to_index
-        WeightingStrategy weight_type
-
-    @classmethod
-    def compute(
-        cls,
-        X,
-        Y,
-        ITYPE_t k,
-        weights,
-        labels,
-        str metric="euclidean",
-        chunk_size=None,
-        dict metric_kwargs=None,
-        str strategy=None,
-    ):
-        """Compute the argkmin reduction.
-
-        This classmethod is responsible for introspecting the arguments
-        values to dispatch to the most appropriate implementation of
-        :class:`ArgKmin32`.
-
-        This allows decoupling the API entirely from the implementation details
-        whilst maintaining RAII: all temporarily allocated datastructures necessary
-        for the concrete implementation are therefore freed when this classmethod
-        returns.
-
-        No instance should directly be created outside of this class method.
-        """
-        if (
-            (
-                metric in ("euclidean", "sqeuclidean")
-                or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2)
-            )
-            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
-        ):
-            # Specialized implementation of ArgKminLabels for the Euclidean distance
-            # for the dense-dense and sparse-sparse cases.
-            # This implementation computes the distances by chunk using
-            # a decomposition of the Squared Euclidean distance.
-            # This specialisation has an improved arithmetic intensity for both
-            # the dense and sparse settings, allowing in most case speed-ups of
-            # several orders of magnitude compared to the generic ArgKmin
-            # implementation.
-            # For more information see MiddleTermComputer.
-            use_squared_distances = metric == "sqeuclidean"
-            pda = EuclideanArgKminLabels32(
-                X=X, Y=Y, k=k,
-                use_squared_distances=use_squared_distances,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                metric_kwargs=metric_kwargs,
-                weights=weights,
-                labels=labels,
-            )
-        else:
-            # Fall back on a generic implementation that handles most scipy
-            # metrics by computing the distances between 2 vectors at a time.
-            pda = ArgKminLabels32(
-                datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs),
-                k=k,
-                chunk_size=chunk_size,
-                strategy=strategy,
-                weights=weights,
-                labels=labels,
-            )
-
-        # Limit the number of threads in second level of nested parallelism for BLAS
-        # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
-            if pda.execute_in_parallel_on_Y:
-                pda._parallel_on_Y()
-            else:
-                pda._parallel_on_X()
-
-        return pda._finalize_results()
-
-    def __init__(
-        self,
-        DatasetsPair32 datasets_pair,
-        const ITYPE_t[:] labels,
-        chunk_size=None,
-        strategy=None,
-        ITYPE_t k=1,
-        weights=None,
-    ):
-        super().__init__(
-            datasets_pair=datasets_pair,
-            chunk_size=chunk_size,
-            strategy=strategy,
-            k=k,
-        )
-
-        if weights == "uniform":
-            self.weight_type = WeightingStrategy.uniform
-        elif weights == "distance":
-            self.weight_type = WeightingStrategy.distance
-        else:
-            self.weight_type = WeightingStrategy.other
-        self.labels = labels
-
-        cdef ITYPE_t[:] unique_labels = np.unique(labels)
-
-        cdef ITYPE_t idx, label
-        # Map from set of unique labels to their indices in `label_weights`
-        for idx, label in enumerate(unique_labels):
-            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
-
-        # Buffer used in building a histogram for one-pass weighted mode
-        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
-
-    def _finalize_results(self):
-        probabilities = np.asarray(self.label_weights)
-        probabilities /= probabilities.sum(axis=1, keepdims=True)
-        return probabilities
-
-    cdef inline void weighted_histogram_mode(
-        self,
-        ITYPE_t sample_index,
-        ITYPE_t* indices,
-        DTYPE_t* distances,
-   ) nogil:
-        cdef:
-            ITYPE_t y_idx, label, label_index, multi_output_index
-            DTYPE_t label_weight = 1
-
-        # Iterate through the sample k-nearest neighbours
-        for jdx in range(self.k):
-            # Absolute indice of the jdx-th Nearest Neighbors
-            # in range [0, n_samples_Y)
-            if self.weight_type == WeightingStrategy.distance:
-                label_weight = 1 / distances[jdx]
-            y_idx = indices[jdx]
-            label = self.labels[y_idx]
-            label_index = self.labels_to_index[label]
-            self.label_weights[sample_index][label_index] += label_weight
-        return
-
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, sample_index
-        # Sorting the main heaps portion associated to `X[X_start:X_end]`
-        # in ascending order w.r.t the distances.
-        for idx in range(X_end - X_start):
-            simultaneous_sort(
-                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
-                self.heaps_indices_chunks[thread_num] + idx * self.k,
-                self.k
-            )
-            # One-pass top-one weighted mode
-            # Compute the absolute index in [0, n_samples_X)
-            sample_index = X_start + idx
-            max_label_weight = -1
-            self.weighted_histogram_mode(
-                sample_index,
-                &self.heaps_indices_chunks[thread_num][0],
-                &self.heaps_r_distances_chunks[thread_num][0],
-            )
-        return
-
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t sample_index, thread_num
-
-        with nogil, parallel(num_threads=self.chunks_n_threads):
-            # Deallocating temporary datastructures
-            for thread_num in prange(self.chunks_n_threads, schedule='static'):
-                free(self.heaps_r_distances_chunks[thread_num])
-                free(self.heaps_indices_chunks[thread_num])
-
-            # Sorting the main in ascending order w.r.t the distances.
-            # This is done in parallel sample-wise (no need for locks).
-            for sample_index in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
-                    &self.argkmin_distances[sample_index, 0],
-                    &self.argkmin_indices[sample_index, 0],
-                    self.k,
-                )
-                self.weighted_histogram_mode(
-                    sample_index,
-                    &self.argkmin_indices[sample_index][0],
-                    &self.argkmin_distances[sample_index][0],
-                )
-        return
-
-cdef class EuclideanArgKminLabels32(EuclideanArgKmin32):
-    """
-    32bit implementation of ArgKminLabel.
-    """
-    cdef:
-        const ITYPE_t[:] labels,
-        DTYPE_t[:, :] label_weights
-        cmap[ITYPE_t, ITYPE_t] labels_to_index
-        WeightingStrategy weight_type
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t k,
-        bint use_squared_distances=False,
-        chunk_size=None,
-        strategy=None,
-        metric_kwargs=None,
-        weights=None,
-        labels=None,
-    ):
-        super().__init__(
-            X=X, Y=Y, k=k,
-            use_squared_distances=use_squared_distances,
-            chunk_size=chunk_size,
-            strategy=strategy,
-            metric_kwargs=metric_kwargs,
-        )
-        if weights == "uniform":
-            self.weight_type = WeightingStrategy.uniform
-        elif weights == "distance":
-            self.weight_type = WeightingStrategy.distance
-        else:
-            self.weight_type = WeightingStrategy.other
-        self.labels = labels
-
-        cdef ITYPE_t[:] unique_labels = np.unique(labels)
-
-        cdef ITYPE_t idx, label
-        # Map from set of unique labels to their indices in `label_weights`
-        for idx, label in enumerate(unique_labels):
-            self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx))
-
-        # Buffer used in building a histogram for one-pass weighted mode
-        self.label_weights = np.zeros((self.n_samples_X,  len(unique_labels)), dtype=DTYPE)
-
-    def _finalize_results(self):
-        probabilities = np.asarray(self.label_weights)
-        probabilities /= probabilities.sum(axis=1, keepdims=True)
-        return probabilities
-
-    cdef inline void weighted_histogram_mode(
-        self,
-        ITYPE_t sample_index,
-        ITYPE_t* indices,
-        DTYPE_t* distances,
-   ) nogil:
-        cdef:
-            ITYPE_t y_idx, label, label_index, multi_output_index
-            DTYPE_t label_weight = 1
-
-        # Iterate through the sample k-nearest neighbours
-        for jdx in range(self.k):
-            # Absolute indice of the jdx-th Nearest Neighbors
-            # in range [0, n_samples_Y)
-            if self.weight_type == WeightingStrategy.distance:
-                label_weight = 1 / distances[jdx]
-            y_idx = indices[jdx]
-            label = self.labels[y_idx]
-            label_index = self.labels_to_index[label]
-            self.label_weights[sample_index][label_index] += label_weight
-        return
-
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t idx, sample_index
-        # Sorting the main heaps portion associated to `X[X_start:X_end]`
-        # in ascending order w.r.t the distances.
-        for idx in range(X_end - X_start):
-            simultaneous_sort(
-                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
-                self.heaps_indices_chunks[thread_num] + idx * self.k,
-                self.k
-            )
-            # One-pass top-one weighted mode
-            # Compute the absolute index in [0, n_samples_X)
-            sample_index = X_start + idx
-            max_label_weight = -1
-            self.weighted_histogram_mode(
-                sample_index,
-                &self.heaps_indices_chunks[thread_num][0],
-                &self.heaps_r_distances_chunks[thread_num][0],
-            )
-        return
-
-    cdef void _parallel_on_Y_finalize(
-        self,
-    ) nogil:
-        cdef:
-            ITYPE_t sample_index, thread_num
-
-        with nogil, parallel(num_threads=self.chunks_n_threads):
-            # Deallocating temporary datastructures
-            for thread_num in prange(self.chunks_n_threads, schedule='static'):
-                free(self.heaps_r_distances_chunks[thread_num])
-                free(self.heaps_indices_chunks[thread_num])
-
-            # Sorting the main in ascending order w.r.t the distances.
-            # This is done in parallel sample-wise (no need for locks).
-            for sample_index in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
-                    &self.argkmin_distances[sample_index, 0],
-                    &self.argkmin_indices[sample_index, 0],
-                    self.k,
-                )
-                self.weighted_histogram_mode(
-                    sample_index,
-                    &self.argkmin_indices[sample_index][0],
-                    &self.argkmin_distances[sample_index][0],
-                )
-        return
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd b/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd
deleted file mode 100644
index 33023cdb2a400..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd
+++ /dev/null
@@ -1,347 +0,0 @@
-cimport numpy as cnp
-
-from libcpp.vector cimport vector
-
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
-
-
-cdef void _middle_term_sparse_sparse_64(
-    const DTYPE_t[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indices,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const DTYPE_t[:] Y_data,
-    const SPARSE_INDEX_TYPE_t[:] Y_indices,
-    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t * D,
-) nogil
-
-cdef class BaseEngine:
-
-    cdef void _parallel_on_X_parallel_init(
-        self,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_X_parallel_finalize(
-        self,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef void _parallel_on_Y_init(
-        self,
-    ) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_Y_synchronize(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-cdef class EuclideanEngine64(BaseEngine):
-    cdef:
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-        ITYPE_t dist_middle_terms_chunks_size
-        ITYPE_t n_features
-        ITYPE_t chunk_size
-        DTYPE_t[::1] X_norm_squared, Y_norm_squared
-
-        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
-        vector[vector[DTYPE_t]] dist_middle_terms_chunks
-
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_init(self) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef DTYPE_t _compute_pair_distance(
-        self,
-        ITYPE_t i, # Index of X sample
-        ITYPE_t j, # Index of Y sample
-        ITYPE_t X_start, # Index offset
-        ITYPE_t Y_start, # Index offset
-        DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms
-    ) nogil
-
-cdef class DenseDenseEuclideanEngine64(EuclideanEngine64):
-    cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
-
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef class SparseSparseEuclideanEngine64(EuclideanEngine64):
-    cdef:
-        const DTYPE_t[:] X_data
-        const SPARSE_INDEX_TYPE_t[:] X_indices
-        const SPARSE_INDEX_TYPE_t[:] X_indptr
-
-        const DTYPE_t[:] Y_data
-        const SPARSE_INDEX_TYPE_t[:] Y_indices
-        const SPARSE_INDEX_TYPE_t[:] Y_indptr
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-cdef class EuclideanEngine32(BaseEngine):
-    cdef:
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-        ITYPE_t dist_middle_terms_chunks_size
-        ITYPE_t n_features
-        ITYPE_t chunk_size
-        DTYPE_t[::1] X_norm_squared, Y_norm_squared
-
-        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
-        vector[vector[DTYPE_t]] dist_middle_terms_chunks
-
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_init(self) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef DTYPE_t _compute_pair_distance(
-        self,
-        ITYPE_t i, # Index of X sample
-        ITYPE_t j, # Index of Y sample
-        ITYPE_t X_start, # Index offset
-        ITYPE_t Y_start, # Index offset
-        DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms
-    ) nogil
-
-cdef class DenseDenseEuclideanEngine32(EuclideanEngine32):
-    cdef:
-        const cnp.float32_t[:, ::1] X
-        const cnp.float32_t[:, ::1] Y
-
-        # Buffers for upcasting chunks of X and Y from 32bit to 64bit
-        vector[vector[DTYPE_t]] X_c_upcast
-        vector[vector[DTYPE_t]] Y_c_upcast
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
-
-
-cdef class SparseSparseEuclideanEngine32(EuclideanEngine32):
-    cdef:
-        const DTYPE_t[:] X_data
-        const SPARSE_INDEX_TYPE_t[:] X_indices
-        const SPARSE_INDEX_TYPE_t[:] X_indptr
-
-        const DTYPE_t[:] Y_data
-        const SPARSE_INDEX_TYPE_t[:] Y_indices
-        const SPARSE_INDEX_TYPE_t[:] Y_indptr
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx b/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx
deleted file mode 100644
index 5e1fe8cb457b3..0000000000000
--- a/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx
+++ /dev/null
@@ -1,940 +0,0 @@
-cimport numpy as cnp
-
-from libcpp.vector cimport vector
-
-from ...utils._cython_blas cimport (
-  BLAS_Order,
-  BLAS_Trans,
-  NoTrans,
-  RowMajor,
-  Trans,
-  _gemm,
-)
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
-
-# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
-# Introduction in Cython:
-#
-# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
-cdef extern from "<algorithm>" namespace "std" nogil:
-    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
-
-import numpy as np
-from scipy.sparse import issparse, csr_matrix
-from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
-from ...utils import check_array
-
-cdef class BaseEngine:
-    def __init__(self):
-        return
-
-    cdef void _parallel_on_X_parallel_init(
-        self,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_prange_iter_finalize(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_parallel_finalize(
-        self,
-        ITYPE_t thread_num
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_init(
-        self,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_synchronize(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-# TODO: If possible optimize this routine to efficiently treat cases where
-# `n_samples_X << n_samples_Y` met in practise when X_test consists of a
-# few samples, and thus when there's a single chunk of X whose number of
-# samples is less that the default chunk size.
-
-# TODO: compare this routine with the similar ones in SciPy, especially
-# `csr_matmat` which might implement a better algorithm.
-# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669  # noqa
-cdef void _middle_term_sparse_sparse_64(
-    const DTYPE_t[:] X_data,
-    const SPARSE_INDEX_TYPE_t[:] X_indices,
-    const SPARSE_INDEX_TYPE_t[:] X_indptr,
-    ITYPE_t X_start,
-    ITYPE_t X_end,
-    const DTYPE_t[:] Y_data,
-    const SPARSE_INDEX_TYPE_t[:] Y_indices,
-    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
-    ITYPE_t Y_start,
-    ITYPE_t Y_end,
-    DTYPE_t * D,
-) nogil:
-    # This routine assumes that D points to the first element of a
-    # zeroed buffer of length at least equal to n_X × n_Y, conceptually
-    # representing a 2-d C-ordered array.
-    cdef:
-        ITYPE_t i, j, k
-        ITYPE_t n_X = X_end - X_start
-        ITYPE_t n_Y = Y_end - Y_start
-        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
-
-    for i in range(n_X):
-        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
-            X_i_col_idx = X_indices[X_i_ptr]
-            for j in range(n_Y):
-                k = i * n_Y + j
-                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
-                    Y_j_col_idx = Y_indices[Y_j_ptr]
-                    if X_i_col_idx == Y_j_col_idx:
-                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
-
-
-from ._base cimport _sqeuclidean_row_norms64
-
-cdef class EuclideanEngine64(BaseEngine):
-    """Helper class to compute a Euclidean distance matrix in chunks.
-
-    This is an abstract base class that is further specialized depending
-    on the type of data (dense or sparse).
-
-    `EuclideanDistance` subclasses relies on the squared Euclidean
-    distances between chunks of vectors X_c and Y_c using the
-    following decomposition for the (i,j) pair :
-
-
-         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-
-
-    This helper class is in charge of wrapping the common logic to compute
-    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
-    """
-
-    @classmethod
-    def get_for(
-        cls,
-        X,
-        Y,
-        pda,
-    ) -> EuclideanEngine64:
-        """Return the DatasetsPair implementation for the given arguments.
-
-        Parameters
-        ----------
-        X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-
-        Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-
-        Returns
-        -------
-        engine: EuclideanEngine64
-            The suited EuclideanEngine64 implementation.
-        """
-        X_is_sparse = issparse(X)
-        Y_is_sparse = issparse(Y)
-        dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk
-        if not X_is_sparse and not Y_is_sparse:
-            return DenseDenseEuclideanEngine64(
-                X,
-                Y,
-                effective_n_threads=pda.effective_n_threads,
-                chunks_n_threads=pda.chunks_n_threads,
-                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
-                chunk_size=pda.chunk_size,
-                metric_kwargs=pda.metric_kwargs,
-            )
-        if X_is_sparse and Y_is_sparse:
-            return SparseSparseEuclideanEngine64(
-                X,
-                Y,
-                effective_n_threads=pda.effective_n_threads,
-                chunks_n_threads=pda.chunks_n_threads,
-                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
-                chunk_size=pda.chunk_size,
-                metric_kwargs=pda.metric_kwargs,
-            )
-
-        raise NotImplementedError(
-            "X and Y must be both CSR sparse matrices or both numpy arrays."
-        )
-
-
-    @classmethod
-    def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype=DTYPE)
-        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
-        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return X_data, X_indices, X_indptr
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t chunk_size,
-        dict metric_kwargs=None,
-    ):
-        self.effective_n_threads = effective_n_threads
-        self.chunks_n_threads = chunks_n_threads
-        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
-        self.n_features = X.shape[1]
-        self.chunk_size = chunk_size
-
-        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
-            self.Y_norm_squared = check_array(
-                metric_kwargs.pop("Y_norm_squared"),
-                ensure_2d=False,
-                input_name="Y_norm_squared",
-                dtype=np.float64,
-            )
-        else:
-            self.Y_norm_squared = _sqeuclidean_row_norms64(
-                Y,
-                self.effective_n_threads,
-            )
-
-        if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
-            self.X_norm_squared = check_array(
-                metric_kwargs.pop("X_norm_squared"),
-                ensure_2d=False,
-                input_name="X_norm_squared",
-                dtype=np.float64,
-            )
-        else:
-            # Do not recompute norms if datasets are identical.
-            self.X_norm_squared = (
-                self.Y_norm_squared if X is Y else
-                _sqeuclidean_row_norms64(
-                    X,
-                    self.effective_n_threads,
-                )
-            )
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return NULL
-
-    cdef DTYPE_t _compute_pair_distance(
-        self,
-        ITYPE_t i, # Index of X sample
-        ITYPE_t j, # Index of Y sample
-        ITYPE_t X_start, # Index offset
-        ITYPE_t Y_start, # Index offset
-        DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms
-    ) nogil:
-
-        cdef ITYPE_t n_Y = len(self.Y_norm_squared)
-        # Index of middle term
-        cdef ITYPE_t k = n_Y * i + j
-        cdef DTYPE_t val = (
-            self.X_norm_squared[i + X_start] +
-            dist_middle_terms[i * n_Y + j] +
-            self.Y_norm_squared[j + Y_start]
-        )
-        # Catastrophic cancellation might cause -0. to be present,
-        # e.g. when computing d(x_i, y_i) when X is Y.
-        return max(0., val)
-
-
-cdef class DenseDenseEuclideanEngine64(EuclideanEngine64):
-    """Computes the middle term of the Euclidean distance between two chunked dense matrices
-    X_c and Y_c.
-
-                        dist_middle_terms = - 2 X_c_i.Y_c_j^T
-
-    This class use the BLAS gemm routine to perform the dot product of each chunks
-    of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
-    """
-
-    def __init__(
-        self,
-        const DTYPE_t[:, ::1] X,
-        const DTYPE_t[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-        dict metric_kwargs=None,
-    ):
-        super().__init__(
-            X, Y,
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-            metric_kwargs=None,
-        )
-        self.X = X
-        self.Y = Y
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-        return
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_end - X_start
-            ITYPE_t n = Y_end - Y_start
-            ITYPE_t K = self.n_features
-            DTYPE_t alpha = - 2.
-            # Casting for A and B to remove the const is needed because APIs exposed via
-            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
-            # See: https://github.com/scipy/scipy/issues/14262
-            DTYPE_t * A = <DTYPE_t *> &self.X[X_start, 0]
-            DTYPE_t * B = <DTYPE_t *> &self.Y[Y_start, 0]
-            ITYPE_t lda = self.n_features
-            ITYPE_t ldb = self.n_features
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_end - Y_start
-
-        # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
-
-        return dist_middle_terms
-
-
-cdef class SparseSparseEuclideanEngine64(EuclideanEngine64):
-    """Middle term of the Euclidean distance between two chunked CSR matrices.
-
-    The result is return as a contiguous array.
-
-            dist_middle_terms = - 2 X_c_i.Y_c_j^T
-
-    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
-    This routine iterates over the data, indices and indptr arrays of the sparse matrices without
-    densifying them.
-    """
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            X, Y,
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-            metric_kwargs=None,
-        )
-        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
-        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            DTYPE_t *dist_middle_terms = (
-                self.dist_middle_terms_chunks[thread_num].data()
-            )
-
-        _middle_term_sparse_sparse_64(
-            self.X_data,
-            self.X_indices,
-            self.X_indptr,
-            X_start,
-            X_end,
-            self.Y_data,
-            self.Y_indices,
-            self.Y_indptr,
-            Y_start,
-            Y_end,
-            dist_middle_terms,
-        )
-
-        return dist_middle_terms
-
-from ._base cimport _sqeuclidean_row_norms32
-
-cdef class EuclideanEngine32(BaseEngine):
-    """Helper class to compute a Euclidean distance matrix in chunks.
-
-    This is an abstract base class that is further specialized depending
-    on the type of data (dense or sparse).
-
-    `EuclideanDistance` subclasses relies on the squared Euclidean
-    distances between chunks of vectors X_c and Y_c using the
-    following decomposition for the (i,j) pair :
-
-
-         ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-
-
-    This helper class is in charge of wrapping the common logic to compute
-    the middle term, i.e. `- 2 X_c_i.Y_c_j^T`.
-    """
-
-    @classmethod
-    def get_for(
-        cls,
-        X,
-        Y,
-        pda,
-    ) -> EuclideanEngine32:
-        """Return the DatasetsPair implementation for the given arguments.
-
-        Parameters
-        ----------
-        X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-
-        Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features)
-            Input data.
-            If provided as a ndarray, it must be C-contiguous.
-
-        Returns
-        -------
-        engine: EuclideanEngine32
-            The suited EuclideanEngine32 implementation.
-        """
-        X_is_sparse = issparse(X)
-        Y_is_sparse = issparse(Y)
-        dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk
-        if not X_is_sparse and not Y_is_sparse:
-            return DenseDenseEuclideanEngine32(
-                X,
-                Y,
-                effective_n_threads=pda.effective_n_threads,
-                chunks_n_threads=pda.chunks_n_threads,
-                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
-                chunk_size=pda.chunk_size,
-                metric_kwargs=pda.metric_kwargs,
-            )
-        if X_is_sparse and Y_is_sparse:
-            return SparseSparseEuclideanEngine32(
-                X,
-                Y,
-                effective_n_threads=pda.effective_n_threads,
-                chunks_n_threads=pda.chunks_n_threads,
-                dist_middle_terms_chunks_size=dist_middle_terms_chunks_size,
-                chunk_size=pda.chunk_size,
-                metric_kwargs=pda.metric_kwargs,
-            )
-
-        raise NotImplementedError(
-            "X and Y must be both CSR sparse matrices or both numpy arrays."
-        )
-
-
-    @classmethod
-    def unpack_csr_matrix(cls, X: csr_matrix):
-        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
-        X_data = np.asarray(X.data, dtype=DTYPE)
-        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
-        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
-        return X_data, X_indices, X_indptr
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t chunk_size,
-        dict metric_kwargs=None,
-    ):
-        self.effective_n_threads = effective_n_threads
-        self.chunks_n_threads = chunks_n_threads
-        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
-        self.n_features = X.shape[1]
-        self.chunk_size = chunk_size
-
-        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-        if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs:
-            self.Y_norm_squared = check_array(
-                metric_kwargs.pop("Y_norm_squared"),
-                ensure_2d=False,
-                input_name="Y_norm_squared",
-                dtype=np.float64,
-            )
-        else:
-            self.Y_norm_squared = _sqeuclidean_row_norms32(
-                Y,
-                self.effective_n_threads,
-            )
-
-        if metric_kwargs is not None and "X_norm_squared" in metric_kwargs:
-            self.X_norm_squared = check_array(
-                metric_kwargs.pop("X_norm_squared"),
-                ensure_2d=False,
-                input_name="X_norm_squared",
-                dtype=np.float64,
-            )
-        else:
-            # Do not recompute norms if datasets are identical.
-            self.X_norm_squared = (
-                self.Y_norm_squared if X is Y else
-                _sqeuclidean_row_norms32(
-                    X,
-                    self.effective_n_threads,
-                )
-            )
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        return
-
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        return NULL
-
-    cdef DTYPE_t _compute_pair_distance(
-        self,
-        ITYPE_t i, # Index of X sample
-        ITYPE_t j, # Index of Y sample
-        ITYPE_t X_start, # Index offset
-        ITYPE_t Y_start, # Index offset
-        DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms
-    ) nogil:
-
-        cdef ITYPE_t n_Y = len(self.Y_norm_squared)
-        # Index of middle term
-        cdef ITYPE_t k = n_Y * i + j
-        cdef DTYPE_t val = (
-            self.X_norm_squared[i + X_start] +
-            dist_middle_terms[i * n_Y + j] +
-            self.Y_norm_squared[j + Y_start]
-        )
-        # Catastrophic cancellation might cause -0. to be present,
-        # e.g. when computing d(x_i, y_i) when X is Y.
-        return max(0., val)
-
-
-cdef class DenseDenseEuclideanEngine32(EuclideanEngine32):
-    """Computes the middle term of the Euclidean distance between two chunked dense matrices
-    X_c and Y_c.
-
-                        dist_middle_terms = - 2 X_c_i.Y_c_j^T
-
-    This class use the BLAS gemm routine to perform the dot product of each chunks
-    of the distance matrix with improved arithmetic intensity and vector instruction (SIMD).
-    """
-
-    def __init__(
-        self,
-        const cnp.float32_t[:, ::1] X,
-        const cnp.float32_t[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-        dict metric_kwargs=None,
-    ):
-        super().__init__(
-            X, Y,
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-            metric_kwargs=None,
-        )
-        self.X = X
-        self.Y = Y
-        # We populate the buffer for upcasting chunks of X and Y from float32 to float64.
-        self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
-        self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-        upcast_buffer_n_elements = self.chunk_size * n_features
-
-        for thread_num in range(self.effective_n_threads):
-            self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements)
-            self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
-
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-
-    cdef void _parallel_on_X_init_chunk(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
-
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
-
-    cdef void _parallel_on_Y_parallel_init(
-        self,
-        ITYPE_t thread_num,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = X_end - X_start
-
-        # Upcasting X_c=X[X_start:X_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.X_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.X[X_start + i, j]
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            ITYPE_t n_chunk_samples = Y_end - Y_start
-
-        # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64
-        for i in range(n_chunk_samples):
-            for j in range(self.n_features):
-                self.Y_c_upcast[thread_num][i * self.n_features + j] = <DTYPE_t> self.Y[Y_start + i, j]
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_end - X_start
-            ITYPE_t n = Y_end - Y_start
-            ITYPE_t K = self.n_features
-            DTYPE_t alpha = - 2.
-            DTYPE_t * A = self.X_c_upcast[thread_num].data()
-            DTYPE_t * B = self.Y_c_upcast[thread_num].data()
-            ITYPE_t lda = self.n_features
-            ITYPE_t ldb = self.n_features
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_end - Y_start
-
-        # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
-
-        return dist_middle_terms
-
-
-cdef class SparseSparseEuclideanEngine32(EuclideanEngine32):
-    """Middle term of the Euclidean distance between two chunked CSR matrices.
-
-    The result is return as a contiguous array.
-
-            dist_middle_terms = - 2 X_c_i.Y_c_j^T
-
-    The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64.
-    This routine iterates over the data, indices and indptr arrays of the sparse matrices without
-    densifying them.
-    """
-
-    def __init__(
-        self,
-        X,
-        Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-        ITYPE_t n_features,
-        ITYPE_t chunk_size,
-    ):
-        super().__init__(
-            X, Y,
-            effective_n_threads,
-            chunks_n_threads,
-            dist_middle_terms_chunks_size,
-            n_features,
-            chunk_size,
-            metric_kwargs=None,
-        )
-        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
-        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
-
-    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        # Flush the thread dist_middle_terms_chunks to 0.0
-        fill(
-            self.dist_middle_terms_chunks[thread_num].begin(),
-            self.dist_middle_terms_chunks[thread_num].end(),
-            0.0,
-        )
-
-    cdef DTYPE_t * _compute_dist_middle_terms(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            DTYPE_t *dist_middle_terms = (
-                self.dist_middle_terms_chunks[thread_num].data()
-            )
-
-        _middle_term_sparse_sparse_64(
-            self.X_data,
-            self.X_indices,
-            self.X_indptr,
-            X_start,
-            X_end,
-            self.Y_data,
-            self.Y_indices,
-            self.Y_indptr,
-            Y_start,
-            Y_end,
-            dist_middle_terms,
-        )
-
-        return dist_middle_terms

From 6bcc83d249d957cb781844b3f350530247107b61 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 19 Dec 2022 13:06:52 -0500
Subject: [PATCH 22/25] Removed excess files and removed prange

---
 sklearn/cluster/_hdbscan/_reachability.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx
index efc641df29e19..dc4263694f89a 100644
--- a/sklearn/cluster/_hdbscan/_reachability.pyx
+++ b/sklearn/cluster/_hdbscan/_reachability.pyx
@@ -10,7 +10,6 @@ cimport numpy as cnp
 import numpy as np
 from scipy.sparse import issparse
 from cython cimport floating, integral
-from cython.parallel cimport prange
 from libc.math cimport isfinite, INFINITY
 
 cnp.import_array()
@@ -111,7 +110,9 @@ def _dense_mutual_reachability_graph(
     )
 
     with nogil:
-        for i in prange(n_samples):
+        # TODO: Reintroduce prange, with the thread count taken from
+        # _openmp_effective_n_threads
+        for i in range(n_samples):
             for j in range(n_samples):
                 mutual_reachibility_distance = max(
                     core_distances[i],

From 9715b94242e738eeb4f60546ee38834fe0dccbc0 Mon Sep 17 00:00:00 2001
From: Meekail Zain <34613774+Micky774@users.noreply.github.com>
Date: Fri, 3 Feb 2023 19:50:01 -0500
Subject: [PATCH 23/25] Update sklearn/cluster/_hdbscan/hdbscan.py

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/cluster/_hdbscan/hdbscan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 7ec7ad56cd177..e7b22aa7aca97 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -137,7 +137,7 @@ def _hdbscan_brute(
     distance_matrix /= alpha
 
     max_distance = metric_params.get("max_distance", 0.0)
-    if isspmatrix_csr(distance_matrix):
+    if issparse(distance_matrix) and distance_matrix.format != "csr":
         # we need CSR format to avoid a conversion in `_brute_mst` when calling
         # `csgraph.connected_components`
         distance_matrix = distance_matrix.tocsr()

From 4667952548fea4f8777f77ed4dc147885ee22af7 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Fri, 3 Feb 2023 19:55:45 -0500
Subject: [PATCH 24/25] Updated test to include value errors on asymmetric
 distance matrices

---
 sklearn/cluster/tests/test_hdbscan.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index 65f9829ffef5c..0a7704bcd85cb 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -78,6 +78,17 @@ def test_hdbscan_distance_matrix():
     score = fowlkes_mallows_score(y, labels)
     assert score >= 0.98
 
+    msg = r"The precomputed distance matrix.*has shape"
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="precomputed", copy=True).fit_predict(X)
+
+    msg = r"The precomputed distance matrix.*values"
+    # Ensure the matrix is not symmetric
+    D[0, 1] = 10
+    D[1, 0] = 1
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="precomputed").fit_predict(D)
+
 
 def test_hdbscan_sparse_distance_matrix():
     D = distance.squareform(distance.pdist(X))

From ad83829c62957e2409a5efde7a3868c61b82d4f6 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 13 Feb 2023 18:13:09 -0500
Subject: [PATCH 25/25] Updated sparse distance matrix test

---
 sklearn/cluster/tests/test_hdbscan.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index 0a7704bcd85cb..36fe8e5a6158c 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -90,14 +90,15 @@ def test_hdbscan_distance_matrix():
         HDBSCAN(metric="precomputed").fit_predict(D)
 
 
-def test_hdbscan_sparse_distance_matrix():
+@pytest.mark.parametrize("sparse_constructor", [sparse.csr_matrix, sparse.csc_matrix])
+def test_hdbscan_sparse_distance_matrix(sparse_constructor):
     D = distance.squareform(distance.pdist(X))
     D /= np.max(D)
 
     threshold = stats.scoreatpercentile(D.flatten(), 50)
 
     D[D >= threshold] = 0.0
-    D = sparse.csr_matrix(D)
+    D = sparse_constructor(D)
     D.eliminate_zeros()
 
     labels = HDBSCAN(metric="precomputed").fit_predict(D)