ENH Add the fused CSR dense case for Euclidean Specializations (#25044)

jjerphan · Vincent-Maladiere · web-flow · commit 67ea7206bc05 · 2023-03-10T13:25:10.000+01:00
Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>
Co-authored-by: Vincent M <maladiere.vincent@yahoo.fr>
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -81,6 +81,42 @@ Changes impacting all modules
   by :user:`John Pangas <jpangas>`, :user:`Rahil Parikh <rprkh>` ,
   and :user:`Alex Buzenet <albuzenet>`.
 
+- |Enhancement| Added a multi-threaded Cython routine to compute the squared
+  Euclidean distances (sometimes followed by a fused reduction operation) for a
+  pair of datasets consisting of a sparse CSR matrix and a dense NumPy array.
+
+  This can improve the performance of the following functions and estimators:
+
+  - :func:`sklearn.metrics.pairwise_distances_argmin`
+  - :func:`sklearn.metrics.pairwise_distances_argmin_min`
+  - :class:`sklearn.cluster.AffinityPropagation`
+  - :class:`sklearn.cluster.Birch`
+  - :class:`sklearn.cluster.MeanShift`
+  - :class:`sklearn.cluster.OPTICS`
+  - :class:`sklearn.cluster.SpectralClustering`
+  - :func:`sklearn.feature_selection.mutual_info_regression`
+  - :class:`sklearn.neighbors.KNeighborsClassifier`
+  - :class:`sklearn.neighbors.KNeighborsRegressor`
+  - :class:`sklearn.neighbors.RadiusNeighborsClassifier`
+  - :class:`sklearn.neighbors.RadiusNeighborsRegressor`
+  - :class:`sklearn.neighbors.LocalOutlierFactor`
+  - :class:`sklearn.neighbors.NearestNeighbors`
+  - :class:`sklearn.manifold.Isomap`
+  - :class:`sklearn.manifold.LocallyLinearEmbedding`
+  - :class:`sklearn.manifold.TSNE`
+  - :func:`sklearn.manifold.trustworthiness`
+  - :class:`sklearn.semi_supervised.LabelPropagation`
+  - :class:`sklearn.semi_supervised.LabelSpreading`
+
+  A typical example of this performance improvement happens when passing a sparse
+  CSR matrix to the `predict` or `transform` method of estimators that rely on
+  a dense NumPy representation to store their fitted parameters (or the reverse).
+
+  For instance, :meth:`sklearn.neighbors.NearestNeighbors.kneighbors` is now up
+  to 2 times faster for this case on commonly available laptops.
+
+  :pr:`25044` by :user:`Julien Jerphanion <jjerphan>`.
+
 Changelog
 ---------
 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -61,10 +61,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
 
         No instance should directly be created outside of this class method.
         """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
-        ):
+        if metric in ("euclidean", "sqeuclidean"):
             # Specialized implementation of ArgKmin for the Euclidean distance
             # for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -119,26 +119,7 @@ def is_valid_sparse_matrix(X):
             and metric in cls.valid_metrics()
         )
 
-        # The other joblib-based back-end might be more efficient on fused sparse-dense
-        # datasets' pairs on metric="(sq)euclidean" for some configurations because it
-        # uses the Squared Euclidean matrix decomposition, i.e.:
-        #
-        #       ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-        #
-        # calling efficient sparse-dense routines for matrix and vectors multiplication
-        # implemented in SciPy we do not use yet here.
-        # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
-        # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
-        # using sparse-dense routines for matrix-vector multiplications.
-        # Currently, only dense-dense and sparse-sparse are optimized for
-        # the Euclidean case.
-        fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))  # "^" is XOR
-            and isinstance(metric, str)
-            and "euclidean" in metric
-        )
-
-        return is_usable and fused_sparse_dense_euclidean_case_guard
+        return is_usable
 
     @classmethod
     @abstractmethod
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -186,4 +186,45 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
     ) noexcept nogil
 
 
+cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const DTYPE_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const {{INPUT_DTYPE_t}}[:, ::1] Y
+
+        # The dense-sparse case is handled by the sparse-dense case, by
+        # treating dist_middle_terms as F-ordered and by swapping arguments.
+        # This flag encodes which case is being computed (True: sparse-dense,
+        # False: dense-sparse) so that the logic can adapt accordingly.
+        bint c_ordered_middle_term
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) noexcept nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) noexcept nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -73,6 +73,34 @@ cdef void _middle_term_sparse_sparse_64(
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
+cdef void _middle_term_sparse_dense_{{name_suffix}}(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const {{INPUT_DTYPE_t}}[:, ::1] Y,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    bint c_ordered_middle_term,
+    DTYPE_t * dist_middle_terms,
+) noexcept nogil:
+    # Accumulate -2 * X[X_start:X_end] @ Y[Y_start:Y_end].T into dist_middle_terms,
+    # which must point to a zero-filled buffer of at least n_X × n_Y elements laid
+    # out as a 2-d C-ordered (c_ordered_middle_term) or F-ordered array.
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr
+
+    for i in range(n_X):
+        for j in range(n_Y):
+            k = i * n_Y + j if c_ordered_middle_term else j * n_X + i
+            for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+                X_i_col_idx = X_indices[X_i_ptr]
+                dist_middle_terms[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx]
+
 
 cdef class MiddleTermComputer{{name_suffix}}:
     """Helper class to compute a Euclidean distance matrix in chunks.
@@ -103,7 +131,7 @@ cdef class MiddleTermComputer{{name_suffix}}:
         n_features,
         chunk_size,
     ) -> MiddleTermComputer{{name_suffix}}:
-        """Return the DatasetsPair implementation for the given arguments.
+        """Return the MiddleTermComputer implementation for the given arguments.
 
         Parameters
         ----------
@@ -143,12 +171,39 @@ cdef class MiddleTermComputer{{name_suffix}}:
                 n_features,
                 chunk_size,
             )
-
+        if X_is_sparse and not Y_is_sparse:
+            return SparseDenseMiddleTermComputer{{name_suffix}}(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+                c_ordered_middle_term=True
+            )
+        if not X_is_sparse and Y_is_sparse:
+            # NOTE: The Dense-Sparse case is implemented via the Sparse-Dense case.
+            #
+            # To do so:
+            #    - X (dense) and Y (sparse) are swapped
+            #    - the distance middle term is seen as F-ordered for consistency
+            #      (c_ordered_middle_term = False)
+            return SparseDenseMiddleTermComputer{{name_suffix}}(
+                # Mind that X and Y are swapped here.
+                Y,
+                X,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+                c_ordered_middle_term=False,
+            )
         raise NotImplementedError(
-            "X and Y must be both CSR sparse matrices or both numpy arrays."
+            "X and Y must be CSR sparse matrices or numpy arrays."
         )
 
-
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
@@ -486,5 +541,101 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
 
         return dist_middle_terms
 
+cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Middle term of the Euclidean distance between chunks of a CSR matrix and a np.ndarray.
+
+    The logic of the computation is wrapped in the routine _middle_term_sparse_dense_{{name_suffix}}.
+    This routine iterates over the data, indices and indptr arrays of the sparse matrix
+    without densifying it.
+    """
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+        bint c_ordered_middle_term,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y = Y
+        self.c_ordered_middle_term = c_ordered_middle_term
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil:
+        # Zero the thread's dist_middle_terms_chunks: _compute_dist_middle_terms
+        # accumulates into its elements (+=) rather than overwriting them.
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil:
+        # Zero the thread's dist_middle_terms_chunks: _compute_dist_middle_terms
+        # accumulates into its elements (+=) rather than overwriting them.
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        # In the dense-sparse case, self.X_* hold the caller's (sparse) Y and
+        # self.Y holds the caller's (dense) X, with dist_middle_terms written as
+        # F-ordered: swap the chunk bounds so each indexes the operand it refers to.
+        if not self.c_ordered_middle_term:
+            X_start, Y_start = Y_start, X_start
+            X_end, Y_end = Y_end, X_end
+
+        _middle_term_sparse_dense_{{name_suffix}}(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y,
+            Y_start,
+            Y_end,
+            self.c_ordered_middle_term,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
 
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -82,10 +82,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
 
         No instance should directly be created outside of this class method.
         """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))  # "^" is XOR
-        ):
+        if metric in ("euclidean", "sqeuclidean"):
             # Specialized implementation of RadiusNeighbors for the Euclidean
             # distance for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -553,15 +553,11 @@ def test_pairwise_distances_reduction_is_usable_for():
         np.asfortranarray(X), Y, metric
     )
 
-    # We prefer not to use those implementations for fused sparse-dense when
-    # metric="(sq)euclidean" because it's not yet the most efficient one on
-    # all configurations of datasets.
-    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
-    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
-    # using sparse-dense routines for matrix-vector multiplications.
-    assert not BaseDistancesReductionDispatcher.is_usable_for(
-        X_csr, Y, metric="euclidean"
+    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric="euclidean")
+    assert BaseDistancesReductionDispatcher.is_usable_for(
+        X, Y_csr, metric="sqeuclidean"
     )
+
     assert BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
@@ -1060,7 +1056,7 @@ def test_pairwise_distances_argkmin(
             row_idx, argkmin_indices_ref[row_idx]
         ]
 
-    for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
+    for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
         argkmin_distances, argkmin_indices = ArgKmin.compute(
             _X,
             _Y,