ENH Add the fused CSR dense case for Euclidean Specializations

jjerphan · jjerphan · commit a943f79f9587 · 2022-11-25T17:00:33.000+01:00
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -61,10 +61,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
 
         No instance should directly be created outside of this class method.
         """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
-        ):
+        if metric in ("euclidean", "sqeuclidean"):
             # Specialized implementation of ArgKmin for the Euclidean distance
-            # for the dense-dense and sparse-sparse cases.
+            # for the dense-dense, sparse-sparse and sparse-dense cases.
             # This implementation computes the distances by chunk using
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -119,26 +119,7 @@ def is_valid_sparse_matrix(X):
             and metric in cls.valid_metrics()
         )
 
-        # The other joblib-based back-end might be more efficient on fused sparse-dense
-        # datasets' pairs on metric="(sq)euclidean" for some configurations because it
-        # uses the Squared Euclidean matrix decomposition, i.e.:
-        #
-        #       ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-        #
-        # calling efficient sparse-dense routines for matrix and vectors multiplication
-        # implemented in SciPy we do not use yet here.
-        # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
-        # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
-        # using sparse-dense routines for matrix-vector multiplications.
-        # Currently, only dense-dense and sparse-sparse are optimized for
-        # the Euclidean case.
-        fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))  # "^" is XOR
-            and isinstance(metric, str)
-            and "euclidean" in metric
-        )
-
-        return is_usable and fused_sparse_dense_euclidean_case_guard
+        return is_usable
 
     @classmethod
     @abstractmethod
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -186,4 +186,41 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
     ) nogil
 
 
+cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const DTYPE_t[:] X_data  # CSR `data` array of the sparse operand
+        const SPARSE_INDEX_TYPE_t[:] X_indices  # CSR `indices` array (column indices)
+        const SPARSE_INDEX_TYPE_t[:] X_indptr  # CSR `indptr` array (row boundaries)
+
+        const DTYPE_t[:, ::1] Y  # dense operand, C-contiguous 2-d memoryview
+
+        bint c_ordered_middle_term  # selects the layout of the middle-term buffer
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -78,6 +78,37 @@ cdef void _middle_term_sparse_sparse_64(
                     if X_i_col_idx == Y_j_col_idx:
                         D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
 
+# TODO: compare this routine with the similar ones in SciPy, especially
+# `csr_matvecs`, which might implement a better algorithm.
+# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L1139-L1175  # noqa
+cdef void _middle_term_sparse_dense_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:, ::1] Y,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    bint c_ordered_middle_term,
+    DTYPE_t * D,
+) nogil:
+    # Accumulate -2 * dot(X[X_start + i], Y[Y_start + j]) for a CSR X and a dense Y
+    # into D, a zeroed buffer of at least n_X × n_Y elements, read as a C-ordered
+    # (n_X, n_Y) array if c_ordered_middle_term, else as a C-ordered (n_Y, n_X) one.
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr
+
+    for i in range(n_X):
+        for j in range(n_Y):
+            k = i * n_Y + j if c_ordered_middle_term else j * n_X + i
+            for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+                X_i_col_idx = X_indices[X_i_ptr]
+                D[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx]
+
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
@@ -111,7 +142,7 @@ cdef class MiddleTermComputer{{name_suffix}}:
         n_features,
         chunk_size,
     ) -> MiddleTermComputer{{name_suffix}}:
-        """Return the DatasetsPair implementation for the given arguments.
+        """Return the MiddleTermComputer implementation for the given arguments.
 
         Parameters
         ----------
@@ -151,12 +182,34 @@ cdef class MiddleTermComputer{{name_suffix}}:
                 n_features,
                 chunk_size,
             )
-
+        if X_is_sparse and not Y_is_sparse:
+            return SparseDenseMiddleTermComputer{{name_suffix}}(
+                X,
+                # TODO: remove cast
+                Y.astype(np.float64, copy=False),
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+                c_ordered_middle_term=True
+            )
+        if not X_is_sparse and Y_is_sparse:
+            return SparseDenseMiddleTermComputer{{name_suffix}}(
+                Y,
+                # TODO: remove cast
+                X.astype(np.float64, copy=False),
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+                c_ordered_middle_term=False,
+            )
         raise NotImplementedError(
-            "X and Y must be both CSR sparse matrices or both numpy arrays."
+            "X and Y must be CSR sparse matrices or numpy arrays."
         )
 
-
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
@@ -494,5 +547,92 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
 
         return dist_middle_terms
 
+cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Middle term of the Euclidean distance between chunks of a CSR matrix and a np.ndarray.
+
+    The computation is delegated to the routine _middle_term_sparse_dense_64, which
+    iterates over the data, indices and indptr arrays of the CSR matrix without densifying
+    it. When c_ordered_middle_term is False, the X and Y chunk bounds are swapped.
+    """
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+        bint c_ordered_middle_term,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y = Y
+        self.c_ordered_middle_term = c_ordered_middle_term
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Zero this thread's buffer: the middle-term routine accumulates into it.
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Zero this thread's buffer: the middle-term routine accumulates into it.
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        _middle_term_sparse_dense_64(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start if self.c_ordered_middle_term else Y_start,
+            X_end if self.c_ordered_middle_term else Y_end,
+            self.Y,
+            Y_start if self.c_ordered_middle_term else X_start,
+            Y_end if self.c_ordered_middle_term else X_end,
+            self.c_ordered_middle_term,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
 
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -82,10 +82,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
 
         No instance should directly be created outside of this class method.
         """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))  # "^" is XOR
-        ):
+        if metric in ("euclidean", "sqeuclidean"):
             # Specialized implementation of RadiusNeighbors for the Euclidean
-            # distance for the dense-dense and sparse-sparse cases.
+            # distance for the dense-dense, sparse-sparse and sparse-dense cases.
             # This implementation computes the distances by chunk using
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1,4 +1,3 @@
-import itertools
 import re
 from collections import defaultdict
 
@@ -552,15 +551,11 @@ def test_pairwise_distances_reduction_is_usable_for():
         np.asfortranarray(X), Y, metric
     )
 
-    # We prefer not to use those implementations for fused sparse-dense when
-    # metric="(sq)euclidean" because it's not yet the most efficient one on
-    # all configurations of datasets.
-    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
-    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
-    # using sparse-dense routines for matrix-vector multiplications.
-    assert not BaseDistancesReductionDispatcher.is_usable_for(
-        X_csr, Y, metric="euclidean"
+    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric="euclidean")
+    assert BaseDistancesReductionDispatcher.is_usable_for(
+        X, Y_csr, metric="sqeuclidean"
     )
+
     assert BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
@@ -848,24 +843,53 @@ def test_format_agnosticism(
         **compute_parameters,
     )
 
-    for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
-        if _X is X and _Y is Y:
-            continue
-        dist, indices = Dispatcher.compute(
-            _X,
-            _Y,
-            parameter,
-            chunk_size=50,
-            return_distance=True,
-            **compute_parameters,
-        )
-        ASSERT_RESULT[(Dispatcher, dtype)](
-            dist_dense,
-            dist,
-            indices_dense,
-            indices,
-            **check_parameters,
-        )
+    dist, indices = Dispatcher.compute(
+        X_csr,
+        Y_csr,
+        parameter,
+        chunk_size=50,
+        return_distance=True,
+        **compute_parameters,
+    )
+    ASSERT_RESULT[(Dispatcher, dtype)](
+        dist_dense,
+        dist,
+        indices_dense,
+        indices,
+        **check_parameters,
+    )
+
+    dist, indices = Dispatcher.compute(
+        X_csr,
+        Y,
+        parameter,
+        chunk_size=50,
+        return_distance=True,
+        **compute_parameters,
+    )
+    ASSERT_RESULT[(Dispatcher, dtype)](
+        dist_dense,
+        dist,
+        indices_dense,
+        indices,
+        **check_parameters,
+    )
+
+    dist, indices = Dispatcher.compute(
+        X,
+        Y_csr,
+        parameter,
+        chunk_size=50,
+        return_distance=True,
+        **compute_parameters,
+    )
+    ASSERT_RESULT[(Dispatcher, dtype)](
+        dist_dense,
+        dist,
+        indices_dense,
+        indices,
+        **check_parameters,
+    )
 
 
 @pytest.mark.parametrize(