diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index ce579874f886c..c3f8de1e55408 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -76,6 +76,42 @@ Changes impacting all modules
   by :user:`John Pangas `, :user:`Rahil Parikh ` ,
   and :user:`Alex Buzenet `.

+- |Enhancement| Added a multi-threaded Cython routine to compute the squared
+  Euclidean distances (sometimes followed by a fused reduction operation) for a
+  pair of datasets consisting of a sparse CSR matrix and a dense NumPy array.
+
+  This can improve the performance of the following functions and estimators:
+
+  - :func:`sklearn.metrics.pairwise_distances_argmin`
+  - :func:`sklearn.metrics.pairwise_distances_argmin_min`
+  - :class:`sklearn.cluster.AffinityPropagation`
+  - :class:`sklearn.cluster.Birch`
+  - :class:`sklearn.cluster.MeanShift`
+  - :class:`sklearn.cluster.OPTICS`
+  - :class:`sklearn.cluster.SpectralClustering`
+  - :func:`sklearn.feature_selection.mutual_info_regression`
+  - :class:`sklearn.neighbors.KNeighborsClassifier`
+  - :class:`sklearn.neighbors.KNeighborsRegressor`
+  - :class:`sklearn.neighbors.RadiusNeighborsClassifier`
+  - :class:`sklearn.neighbors.RadiusNeighborsRegressor`
+  - :class:`sklearn.neighbors.LocalOutlierFactor`
+  - :class:`sklearn.neighbors.NearestNeighbors`
+  - :class:`sklearn.manifold.Isomap`
+  - :class:`sklearn.manifold.LocallyLinearEmbedding`
+  - :class:`sklearn.manifold.TSNE`
+  - :func:`sklearn.manifold.trustworthiness`
+  - :class:`sklearn.semi_supervised.LabelPropagation`
+  - :class:`sklearn.semi_supervised.LabelSpreading`
+
+  A typical example of this performance improvement happens when passing a sparse
+  CSR matrix to the `predict` or `transform` method of estimators that rely on
+  a dense NumPy representation to store their fitted parameters (or the reverse).
+
+  For instance, :meth:`sklearn.neighbors.NearestNeighbors.kneighbors` is now up to 2 times faster
+  for this case on commonly available laptops.
+
+  :pr:`25044` by :user:`Julien Jerphanion `.
+
 Changelog
 ---------

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 246fa90f532fd..3b2bce128dcb6 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -61,10 +61,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):

         No instance should directly be created outside of this class method.
         """
-        if (
-            metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
-        ):
+        if metric in ("euclidean", "sqeuclidean"):
             # Specialized implementation of ArgKmin for the Euclidean distance
             # for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 62403d1c334f0..576cc64ff5295 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -119,26 +119,7 @@ def is_valid_sparse_matrix(X):
             and metric in cls.valid_metrics()
         )

-        # The other joblib-based back-end might be more efficient on fused sparse-dense
-        # datasets' pairs on metric="(sq)euclidean" for some configurations because it
-        # uses the Squared Euclidean matrix decomposition, i.e.:
-        #
-        #   ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-        #
-        # calling efficient sparse-dense routines for matrix and vectors multiplication
-        # implemented in SciPy we do not use yet here.
-        # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
-        # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
-        # using sparse-dense routines for matrix-vector multiplications.
-        # Currently, only dense-dense and sparse-sparse are optimized for
-        # the Euclidean case.
-        fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))  # "^" is XOR
-            and isinstance(metric, str)
-            and "euclidean" in metric
-        )
-
-        return is_usable and fused_sparse_dense_euclidean_case_guard
+        return is_usable

     @classmethod
     @abstractmethod
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
index da158199201f2..6b116f0f44d6f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -186,4 +186,45 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam
     ) noexcept nogil


+cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const DTYPE_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const {{INPUT_DTYPE_t}}[:, ::1] Y
+
+        # We handle the dense-sparse case via the sparse-dense case by simply
+        # treating the dist_middle_terms as F-ordered and by swapping arguments.
+        # This attribute is meant to encode the case and adapt the logic
+        # accordingly.
+        bint c_ordered_middle_term
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) noexcept nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) noexcept nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil
+
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 79a15d0fd6e26..255efc83565d5 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -73,6 +73,34 @@ cdef void _middle_term_sparse_sparse_64(

 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}

+cdef void _middle_term_sparse_dense_{{name_suffix}}(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const {{INPUT_DTYPE_t}}[:, ::1] Y,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    bint c_ordered_middle_term,
+    DTYPE_t * dist_middle_terms,
+) nogil:
+    # This routine assumes that dist_middle_terms is a pointer to the first element
+    # of a buffer filled with zeros of length at least equal to n_X × n_Y, conceptually
+    # representing a 2-d C-ordered or F-ordered array.
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
+
+    for i in range(n_X):
+        for j in range(n_Y):
+            k = i * n_Y + j if c_ordered_middle_term else j * n_X + i
+            for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+                X_i_col_idx = X_indices[X_i_ptr]
+                dist_middle_terms[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx]
+

 cdef class MiddleTermComputer{{name_suffix}}:
     """Helper class to compute a Euclidean distance matrix in chunks.
@@ -103,7 +131,7 @@ cdef class MiddleTermComputer{{name_suffix}}:
         n_features,
         chunk_size,
     ) -> MiddleTermComputer{{name_suffix}}:
-        """Return the DatasetsPair implementation for the given arguments.
+        """Return the MiddleTermComputer implementation for the given arguments.

         Parameters
         ----------
@@ -143,12 +171,39 @@ cdef class MiddleTermComputer{{name_suffix}}:
                 n_features,
                 chunk_size,
             )
-
+        if X_is_sparse and not Y_is_sparse:
+            return SparseDenseMiddleTermComputer{{name_suffix}}(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+                c_ordered_middle_term=True
+            )
+        if not X_is_sparse and Y_is_sparse:
+            # NOTE: The Dense-Sparse case is implemented via the Sparse-Dense case.
+            #
+            # To do so:
+            #   - X (dense) and Y (sparse) are swapped
+            #   - the distance middle term is seen as F-ordered for consistency
+            #     (c_ordered_middle_term = False)
+            return SparseDenseMiddleTermComputer{{name_suffix}}(
+                # Mind that X and Y are swapped here.
+                Y,
+                X,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+                c_ordered_middle_term=False,
+            )
         raise NotImplementedError(
-            "X and Y must be both CSR sparse matrices or both numpy arrays."
+            "X and Y must be CSR sparse matrices or numpy arrays."
         )
-
     @classmethod
     def unpack_csr_matrix(cls, X: csr_matrix):
         """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
@@ -486,5 +541,101 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam

         return dist_middle_terms

+
+cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Middle term of the Euclidean distance between chunks of a CSR matrix and a np.ndarray.
+
+    The logic of the computation is wrapped in the routine _middle_term_sparse_dense_{{name_suffix}}.
+    This routine iterates over the data, indices and indptr arrays of the sparse matrix
+    without densifying it.
+    """
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+        bint c_ordered_middle_term,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y = Y
+        self.c_ordered_middle_term = c_ordered_middle_term
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil:
+        # Fill the thread's dist_middle_terms_chunks with 0.0 before
+        # computing its elements in _compute_dist_middle_terms.
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil:
+        # Fill the thread's dist_middle_terms_chunks with 0.0 before
+        # computing its elements in _compute_dist_middle_terms.
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) noexcept nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        # For the dense-sparse case, we use the sparse-dense case
+        # with dist_middle_terms seen as F-ordered.
+        # Hence we swap the chunk indices here.
+        if not self.c_ordered_middle_term:
+            X_start, Y_start = Y_start, X_start
+            X_end, Y_end = Y_end, X_end
+
+        _middle_term_sparse_dense_{{name_suffix}}(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y,
+            Y_start,
+            Y_end,
+            self.c_ordered_middle_term,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
 {{endfor}}
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 59c05cb90c839..6b9492e6a81f1 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -82,10 +82,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})

         No instance should directly be created outside of this class method.
""" - if ( - metric in ("euclidean", "sqeuclidean") - and not (issparse(X) ^ issparse(Y)) # "^" is XOR - ): + if metric in ("euclidean", "sqeuclidean"): # Specialized implementation of RadiusNeighbors for the Euclidean # distance for the dense-dense and sparse-sparse cases. # This implementation computes the distances by chunk using diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 4fe8013cd3602..ad0ddbc60e9bd 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -553,15 +553,11 @@ def test_pairwise_distances_reduction_is_usable_for(): np.asfortranarray(X), Y, metric ) - # We prefer not to use those implementations for fused sparse-dense when - # metric="(sq)euclidean" because it's not yet the most efficient one on - # all configurations of datasets. - # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa - # TODO: implement specialisation for (sq)euclidean on fused sparse-dense - # using sparse-dense routines for matrix-vector multiplications. - assert not BaseDistancesReductionDispatcher.is_usable_for( - X_csr, Y, metric="euclidean" + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric="euclidean") + assert BaseDistancesReductionDispatcher.is_usable_for( + X, Y_csr, metric="sqeuclidean" ) + assert BaseDistancesReductionDispatcher.is_usable_for( X_csr, Y_csr, metric="sqeuclidean" ) @@ -1060,7 +1056,7 @@ def test_pairwise_distances_argkmin( row_idx, argkmin_indices_ref[row_idx] ] - for _X, _Y in [(X, Y), (X_csr, Y_csr)]: + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): argkmin_distances, argkmin_indices = ArgKmin.compute( _X, _Y,