diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 9191efae2a8da..9606eb1273ce8 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -3,16 +3,89 @@
 #
 # Author: Julien Jerphanion
 #
+# Overview
+# --------
 #
-# The abstractions defined here are used in various algorithms performing
-# the same structure of operations on distances between row vectors
-# of a datasets pair (X, Y).
+# This module provides routines to compute pairwise distances between a set
+# of row vectors of X and another set of row vectors of Y and apply a
+# reduction on top. The canonical example is the brute-force computation
+# of the top k nearest neighbors by leveraging the arg-k-min reduction.
 #
-# Importantly, the core of the computation is chunked to make sure that the pairwise
-# distance chunk matrices stay in CPU cache before applying the final reduction step.
-# Furthermore, the chunking strategy is also used to leverage OpenMP-based parallelism
-# (using Cython prange loops) which gives another multiplicative speed-up in
-# favorable cases on many-core machines.
+# The reduction takes a matrix of pairwise distances between rows of X and Y
+# as input and outputs an aggregate data-structure for each row of X. The
+# aggregate values are typically smaller than the number of rows in Y, hence
+# the term reduction.
+#
+# For computational reasons, the reductions are performed on the fly on chunks
+# of rows of X and Y so as to keep intermediate data-structures in CPU cache
+# and avoid unnecessary round trips of large distance arrays between the CPU
+# cache and the RAM that would otherwise severely degrade the speed by making
+# the overall processing memory-bound.
+#
+# Finally, the routines follow a generic parallelization template to process
+# chunks of data with OpenMP loops (via Cython prange), either on rows of X
+# or rows of Y depending on their respective sizes.
+#
+#
+# Dispatching to specialized implementations
+# ------------------------------------------
+#
+# Dispatchers are meant to be used in the Python code. Under the hood, a
+# dispatcher must only define the logic to choose at runtime the correct
+# dtype-specialized :class:`PairwiseDistancesReduction` implementation based
+# on the dtype of X and of Y.
+#
+#
+# High-level diagram
+# ------------------
+#
+#   Legend:
+#
+#      A ---⊳ B: A inherits from B
+#      A ---x B: A dispatches on B
+#
+#
+#                               (base dispatcher)
+#                           PairwiseDistancesReduction
+#                                      ∆
+#                                      |
+#                                      |
+#                 +--------------------+--------------------+
+#                 |                                         |
+#           (dispatcher)                              (dispatcher)
+#    PairwiseDistancesArgKmin        PairwiseDistancesRadiusNeighborhood
+#                 |                                         |
+#                 |                                         |
+#                 |              (64bit implem.)            |
+#                 |        PairwiseDistancesReduction64     |
+#                 |                      ∆                  |
+#                 |                      |                  |
+#                 |                      |                  |
+#                 |           +----------+----------+       |
+#                 |           |                     |       |
+#                 |           |                     |       |
+#                 x           |                     |       x
+#     PairwiseDistancesArgKmin64     PairwiseDistancesRadiusNeighborhood64
+#                 |           ∆                     ∆       |
+#                 |           |                     |       |
+#                 x           |                     |       |
+#   FastEuclideanPairwiseDistancesArgKmin64         |       |
+#                                                   |       |
+#                                                   |       x
+#                 FastEuclideanPairwiseDistancesRadiusNeighborhood64
+#
+# For instance, :class:`PairwiseDistancesArgKmin` dispatches to
+# :class:`PairwiseDistancesArgKmin64` if X and Y are both dense NumPy arrays
+# with a float64 dtype.
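+#
+# As an illustration, a typical top-level call through this dispatcher
+# (assuming dense float64 inputs, per the dispatching rule above) looks like:
+#
+#     distances, indices = PairwiseDistancesArgKmin.compute(
+#         X, Y, k=10, metric="euclidean", return_distance=True,
+#     )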
+#
+# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean",
+# :class:`PairwiseDistancesArgKmin64` further dispatches to
+# :class:`FastEuclideanPairwiseDistancesArgKmin64`, a specialized subclass
+# to optimally handle the Euclidean distance case using the Generalized Matrix
+# Multiplication (see the docstring of :class:`GEMMTermComputer64` for details).
+from abc import abstractmethod
+
 cimport numpy as cnp
 import numpy as np
 import warnings
@@ -25,7 +98,6 @@ from libcpp.vector cimport vector
 from cython cimport final
 from cython.operator cimport dereference as deref
 from cython.parallel cimport parallel, prange
-from cpython.ref cimport Py_INCREF
 
 from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair
 from ..utils._cython_blas cimport (
@@ -53,7 +125,6 @@ from ..utils.fixes import threadpool_limits
 from ..utils._openmp_helpers import _openmp_effective_n_threads
 from ..utils._typedefs import ITYPE, DTYPE
 
-
 cnp.import_array()
 
 # TODO: change for `libcpp.algorithm.move` once Cython 3 is used
@@ -82,8 +153,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
     """Coerce a std::vector of std::vector to a ndarray of ndarray."""
     cdef:
         ITYPE_t n = deref(vecs).size()
-        cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n,
-                                                                      dtype=np.ndarray)
+        cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray)
 
     for i in range(n):
         nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i]))
@@ -91,8 +161,351 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays(
     return nd_arrays_of_nd_arrays
 
 #####################
+# Dispatchers
+
+class PairwiseDistancesReduction:
+    """Abstract base dispatcher for pairwise distance computation & reduction.
+
+    Each dispatcher extending the base :class:`PairwiseDistancesReduction`
+    dispatcher must implement the :meth:`compute` classmethod.
+    """
+
+    @classmethod
+    def valid_metrics(cls) -> List[str]:
+        excluded = {
+            "pyfunc",  # is relatively slow because we need to coerce data as np arrays
+            "mahalanobis",  # is numerically unstable
+            # TODO: In order to support discrete distance metrics, we need to have a
+            # stable simultaneous sort which preserves the order of the input.
+            # The best might be using std::stable_sort and a Comparator taking an
+            # Arrays of Structures instead of Structure of Arrays (currently used).
+            "hamming",
+            *BOOL_METRICS,
+        }
+        return sorted(set(METRIC_MAPPING.keys()) - excluded)
+
+    @classmethod
+    def is_usable_for(cls, X, Y, metric) -> bool:
+        """Return True if the PairwiseDistancesReduction can be used for the
+        given parameters.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        Returns
+        -------
+        True if the PairwiseDistancesReduction can be used, else False.
+        """
+        dtypes_validity = X.dtype == Y.dtype and Y.dtype == np.float64
+        return (get_config().get("enable_cython_pairwise_dist", True) and
+                not issparse(X) and not issparse(Y) and dtypes_validity and
+                metric in cls.valid_metrics())
+
+    @classmethod
+    @abstractmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        **kwargs,
+    ):
+        """Compute the reduction.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        **kwargs : additional parameters for the reduction
+
+        Notes
+        -----
+        This method is an abstract class method: it has to be implemented
+        for all subclasses.
+        """
+
+class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
+    """Compute the argkmin of row vectors of X on the ones of Y.
+
+    For each row vector of X, computes the indices of the k first row
+    vectors of Y with the smallest distances.
+
+    PairwiseDistancesArgKmin is typically used to perform
+    brute-force k-nearest neighbors queries.
+
+    This class is not meant to be instantiated; one should only use
+    its :meth:`compute` classmethod which handles allocation and
+    deallocation consistently.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        k,
+        metric="euclidean",
+        chunk_size=None,
+        metric_kwargs=None,
+        strategy=None,
+        return_distance=False,
+    ):
+        """Compute the argkmin reduction.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        k : int
+            The k for the argkmin reduction.
+
+        metric : str, default='euclidean'
+            The distance metric to use for argkmin.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None
+            The number of vectors per chunk. If None (default), looks up
+            the scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and uses 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset the parallelization
+            is made on.
+
+            For both strategies, the computation happens with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+            - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+            - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+            - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+            - None (default) looks up the scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and uses 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - argkmin_indices : ndarray of shape (n_samples_X, k)
+            Indices of the argkmin for each vector in X.
+
+        If return_distance=True:
+          - argkmin_distances : ndarray of shape (n_samples_X, k)
+            Distances to the argkmin for each vector in X.
+          - argkmin_indices : ndarray of shape (n_samples_X, k)
+            Indices of the argkmin for each vector in X.
+
+        Notes
+        -----
+        This classmethod is responsible for introspecting the argument
+        values to dispatch to the most appropriate implementation of
+        :class:`PairwiseDistancesArgKmin`.
+
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory handles specialisations for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various backend and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistancesArgKmin64.compute(
+                X=X,
+                Y=Y,
+                k=k,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                return_distance=return_distance,
+            )
+        raise ValueError(
+            f"Only 64bit float datasets are supported at this time, "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+
+class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction):
+    """Compute radius-based neighbors for two sets of vectors.
+
+    For each row-vector X[i] of the queries X, find all the indices j of
+    row-vectors in Y such that:
+
+                        dist(X[i], Y[j]) <= radius
+
+    The distance function `dist` depends on the values of the `metric`
+    and `metric_kwargs` parameters.
+
+    This class is not meant to be instantiated; one should only use
+    its :meth:`compute` classmethod which handles allocation and
+    deallocation consistently.
+    """
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        radius,
+        metric="euclidean",
+        chunk_size=None,
+        metric_kwargs=None,
+        strategy=None,
+        return_distance=False,
+        sort_results=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        radius : float
+            The radius defining the neighborhood.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None
+            The number of vectors per chunk. If None (default), looks up
+            the scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and uses 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset the parallelization
+            is made on.
+
+            For both strategies, the computation happens with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differ on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+
+            - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+            - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread processes all the chunks of X in turn. This strategy is
+              a sequence of embarrassingly parallel subtasks (the inner loop on Y
+              chunks) with intermediate datastructures synchronisation at each
+              iteration of the sequential outer loop on X chunks.
+
+            - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
+              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
+              for parallelism and is therefore more efficient despite the synchronization
+              step at each iteration of the outer loop on chunks of `X`.
+
+            - None (default) looks up the scikit-learn configuration for
+              `pairwise_dist_parallel_strategy`, and uses 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its neighbors if set to True.
+
+        sort_results : boolean, default=False
+            Sort results with respect to distances between each X vector and its
+            neighbors if set to True.
+
+        Returns
+        -------
+        If return_distance=False:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+
+        If return_distance=True:
+          - neighbors_indices : ndarray of n_samples_X ndarray
+            Indices of the neighbors for each vector in X.
+          - neighbors_distances : ndarray of n_samples_X ndarray
+            Distances to the neighbors for each vector in X.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the argument
+        values to dispatch to the private dtype-specialized implementation of
+        :class:`PairwiseDistancesRadiusNeighborhood`.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the API entirely from the
+        implementation details whilst maintaining RAII.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory handles specialisations for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various backend and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+        if X.dtype == Y.dtype == np.float64:
+            return PairwiseDistancesRadiusNeighborhood64.compute(
+                X=X,
+                Y=Y,
+                radius=radius,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+                sort_results=sort_results,
+                return_distance=return_distance,
+            )
+        raise ValueError(
+            f"Only 64bit float datasets are supported at this time, "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
+
+#####################
+# dtype-specialized implementations
 
-cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
+cpdef DTYPE_t[::1] _sqeuclidean_row_norms64(
     const DTYPE_t[:, ::1] X,
     ITYPE_t num_threads,
 ):
@@ -106,93 +519,163 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms(
         # const qualifier.
# See: https://github.com/scipy/scipy/issues/14262 DTYPE_t * X_ptr = &X[0, 0] - ITYPE_t idx = 0 + ITYPE_t i = 0 ITYPE_t n = X.shape[0] ITYPE_t d = X.shape[1] DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) - for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): - squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + for i in prange(n, schedule='static', nogil=True, num_threads=num_threads): + squared_row_norms[i] = _dot(d, X_ptr + i * d, 1, X_ptr + i * d, 1) return squared_row_norms -##################### +cdef class GEMMTermComputer64: + """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. + + `FastEuclidean*` classes internally compute the squared Euclidean distances between + chunks of vectors X_c and Y_c using the following decomposition: + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² -cdef class PairwiseDistancesReduction: - """Abstract base class for pairwise distance computation & reduction. - - Subclasses of this class compute pairwise distances between a set of - row vectors of X and another set of row vectors of Y and apply a reduction on top. - The reduction takes a matrix of pairwise distances between rows of X and Y - as input and outputs an aggregate data-structure for each row of X. - The aggregate values are typically smaller than the number of rows in Y, - hence the term reduction. - - For computational reasons, it is interesting to perform the reduction on - the fly on chunks of rows of X and Y so as to keep intermediate - data-structures in CPU cache and avoid unnecessary round trips of large - distance arrays with the RAM that would otherwise severely degrade the - speed by making the overall processing memory-bound. - - The base class provides the generic chunked parallelization template using - OpenMP loops (Cython prange), either on rows of X or rows of Y depending on - their respective sizes. - - The subclasses are specialized for reduction. - - The actual distance computation for a given pair of rows of X and Y are - delegated to format-specific subclasses of the DatasetsPair companion base - class. - - Parameters - ---------- - datasets_pair: DatasetsPair - The pair of dataset to use. - - chunk_size: int, default=None - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - For both strategies the computations happens with two nested loops, - respectively on chunks of X and chunks of Y. - Strategies differs on which loop (outer or inner) is made to run - in parallel with the Cython `prange` construct: - - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. - - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. 
-
-      - 'auto' relies on a simple heuristic to choose between
-        'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
-        'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
-        is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
-        for parallelism and is therefore more efficient despite the synchronization
-        step at each iteration of the outer loop on chunks of `X`.
-
-      - None (default) looks-up in scikit-learn configuration for
-        `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+    This helper class is in charge of wrapping the common logic to compute
+    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
+    arithmetic intensity.
     """
+    cdef:
+        const DTYPE_t[:, ::1] X
+        const DTYPE_t[:, ::1] Y
+
+        ITYPE_t effective_n_threads
+        ITYPE_t chunks_n_threads
+        ITYPE_t dist_middle_terms_chunks_size
+        ITYPE_t n_features
+        ITYPE_t chunk_size
+
+        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
+        vector[vector[DTYPE_t]] dist_middle_terms_chunks
+
+    def __init__(self,
+        DTYPE_t[:, ::1] X,
+        DTYPE_t[:, ::1] Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        self.X = X
+        self.Y = Y
+        self.effective_n_threads = effective_n_threads
+        self.chunks_n_threads = chunks_n_threads
+        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
+        self.n_features = n_features
+        self.chunk_size = chunk_size
+
+        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
+
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
+        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
+
+    cdef void _parallel_on_X_init_chunk(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_init(self) nogil:
+        for thread_num in range(self.chunks_n_threads):
+            self.dist_middle_terms_chunks[thread_num].resize(
+                self.dist_middle_terms_chunks_size
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef DTYPE_t * _compute_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t i, j
+            DTYPE_t squared_dist_i_j
+            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
+
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays
+            # in BLAS documentation, for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
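+
+            # Note: with ta=NoTrans and tb=Trans, GEMM computes
+            # C = alpha * A @ B.T + beta * C, i.e. -2 * X_c @ Y_c.T here,
+            # with A of shape (m, K), B of shape (n, K) and C of shape (m, n).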
+            # Casting for A and B to remove the const is needed because APIs exposed via
+            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
+            # See: https://github.com/scipy/scipy/issues/14262
+            DTYPE_t * A = &X_c[0, 0]
+            DTYPE_t * B = &Y_c[0, 0]
+            ITYPE_t lda = X_c.shape[1]
+            ITYPE_t ldb = X_c.shape[1]
+            DTYPE_t beta = 0.
+            ITYPE_t ldc = Y_c.shape[0]
+
+        # dist_middle_terms = `-2 * X_c @ Y_c.T`
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+
+        return dist_middle_terms
+
+cdef class PairwiseDistancesReduction64:
+    """Base 64bit implementation of PairwiseDistancesReduction."""
     cdef:
         readonly DatasetsPair datasets_pair
 
         # The number of threads that can be used is stored in effective_n_threads.
         #
-        # The number of threads to use in the parallelisation strategy
+        # The number of threads to use in the parallelization strategy
         # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads:
-        # for small datasets, less threads might be needed to loop over pair of chunks.
+        # for small datasets, fewer threads might be needed to loop over pairs of chunks.
        #
-        # Hence the number of threads that _will_ be used for looping over chunks
+        # Hence, the number of threads that _will_ be used for looping over chunks
         # is stored in chunks_n_threads, allowing solely using what we need.
        #
        # Thus, an invariant is:
@@ -209,47 +692,6 @@ cdef class PairwiseDistancesReduction:
 
         bint execute_in_parallel_on_Y
 
-    @classmethod
-    def valid_metrics(cls) -> List[str]:
-        excluded = {
-            "pyfunc",  # is relatively slow because we need to coerce data as np arrays
-            "mahalanobis",  # is numerically unstable
-            # TODO: In order to support discrete distance metrics, we need to have a
-            # stable simultaneous sort which preserves the order of the input.
-            # The best might be using std::stable_sort and a Comparator taking an
-            # Arrays of Structures instead of Structure of Arrays (currently used).
-            "hamming",
-            *BOOL_METRICS,
-        }
-        return sorted(set(METRIC_MAPPING.keys()) - excluded)
-
-    @classmethod
-    def is_usable_for(cls, X, Y, metric) -> bool:
-        """Return True if the PairwiseDistancesReduction can be used for the given parameters.
-
-        Parameters
-        ----------
-        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
-            Input data.
-
-        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
-            Input data.
-
-        metric : str, default='euclidean'
-            The distance metric to use.
-            For a list of available metrics, see the documentation of
-            :class:`~sklearn.metrics.DistanceMetric`.
-
-        Returns
-        -------
-        True if the PairwiseDistancesReduction can be used, else False.
- """ - # TODO: support sparse arrays and 32 bits - return (get_config().get("enable_cython_pairwise_dist", True) and - not issparse(X) and X.dtype == np.float64 and - not issparse(Y) and Y.dtype == np.float64 and - metric in cls.valid_metrics()) - def __init__( self, DatasetsPair datasets_pair, @@ -348,7 +790,8 @@ cdef class PairwiseDistancesReduction: X_end = X_start + self.X_n_samples_chunk # Reinitializing thread datastructures for the new X chunk - self._parallel_on_X_init_chunk(thread_num, X_start) + # If necessary, upcast X[X_start:X_end] to 64bit + self._parallel_on_X_init_chunk(thread_num, X_start, X_end) for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -357,6 +800,13 @@ cdef class PairwiseDistancesReduction: else: Y_end = Y_start + self.Y_n_samples_chunk + # If necessary, upcast Y[Y_start:Y_end] to 64bit + self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self._compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, @@ -409,7 +859,8 @@ cdef class PairwiseDistancesReduction: thread_num = _openmp_thread_num() # Initializing datastructures used in this thread - self._parallel_on_Y_parallel_init(thread_num) + # If necessary, upcast X[X_start:X_end] to 64bit + self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -418,6 +869,13 @@ cdef class PairwiseDistancesReduction: else: Y_end = Y_start + self.Y_n_samples_chunk + # If necessary, upcast Y[Y_start:Y_end] to 64bit + self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self._compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, @@ -450,8 +908,9 @@ cdef class PairwiseDistancesReduction: ) nogil: """Compute the pairwise distances on two chunks of X and Y and reduce them. - This is THE core computational method of PairwiseDistanceReductions. - This must be implemented in subclasses. + This is THE core computational method of PairwiseDistanceReductions64. + This must be implemented in subclasses agnostically from the parallelization + strategies. """ return @@ -475,12 +934,27 @@ cdef class PairwiseDistancesReduction: """Allocate datastructures used in a thread given its number.""" return - cdef void _parallel_on_X_init_chunk( + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, - ITYPE_t thread_num, ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, ) nogil: - """Initialise datastructures used in a thread given its number.""" + """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. + + This is eventually used to upcast X[X_start:X_end] to 64bit. 
+ """ return cdef void _parallel_on_X_prange_iter_finalize( @@ -508,10 +982,26 @@ cdef class PairwiseDistancesReduction: cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: """Initialise datastructures used in a thread given its number.""" return + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. + + This is eventually used to upcast Y[Y_start:Y_end] to 64bit. + """ + return + cdef void _parallel_on_Y_synchronize( self, ITYPE_t X_start, @@ -526,28 +1016,8 @@ cdef class PairwiseDistancesReduction: """Update datastructures after executing all the reductions.""" return -cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - """Compute the argkmin of row vectors of X on the ones of Y. - - For each row vector of X, computes the indices of k first the rows - vectors of Y with the smallest distances. - - PairwiseDistancesArgKmin is typically used to perform - bruteforce k-nearest neighbors queries. - - Parameters - ---------- - datasets_pair: DatasetsPair - The dataset pairs (X, Y) for the reduction. - - chunk_size: int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - k: int, default=1 - The k for the argkmin reduction. - """ +cdef class PairwiseDistancesArgKmin64(PairwiseDistancesReduction64): + """64bit implementation of PairwiseDistancesArgKmin.""" cdef: ITYPE_t k @@ -571,94 +1041,19 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): str strategy=None, bint return_distance=False, ): - """Return the results of the reduction for the given arguments. - - Parameters - ---------- - X : ndarray or CSR matrix of shape (n_samples_X, n_features) - Input data. - - Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) - Input data. - - k : int - The k for the argkmin reduction. - - metric : str, default='euclidean' - The distance metric to use for argkmin. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. - - chunk_size : int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - metric_kwargs : dict, default=None - Keyword arguments to pass to specified metric function. - - strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - For both strategies the computations happens with two nested loops, - respectively on chunks of X and chunks of Y. - Strategies differs on which loop (outer or inner) is made to run - in parallel with the Cython `prange` construct: - - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. - - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. 
-
-            - 'auto' relies on a simple heuristic to choose between
-              'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
-              'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]`
-              is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity
-              for parallelism and is therefore more efficient despite the synchronization
-              step at each iteration of the outer loop on chunks of `X`.
-
-            - None (default) looks-up in scikit-learn configuration for
-              `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
-
-        return_distance : boolean, default=False
-            Return distances between each X vector and its
-            argkmin if set to True.
-
-        Returns
-        -------
-        If return_distance=False:
-          - argkmin_indices : ndarray of shape (n_samples_X, k)
-            Indices of the argkmin for each vector in X.
-
-        If return_distance=True:
-          - argkmin_distances : ndarray of shape (n_samples_X, k)
-            Distances to the argkmin for each vector in X.
-          - argkmin_indices : ndarray of shape (n_samples_X, k)
-            Indices of the argkmin for each vector in X.
+        """Compute the argkmin reduction.
 
-        Notes
-        -----
-        This public classmethod is responsible for introspecting the arguments
-        values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute`
-        instance method of the most appropriate :class:`PairwiseDistancesArgKmin`
-        concrete implementation.
+        This classmethod is responsible for introspecting the argument
+        values to dispatch to the most appropriate implementation of
+        :class:`PairwiseDistancesArgKmin64`.
 
-        All temporarily allocated datastructures necessary for the concrete
-        implementation are therefore freed when this classmethod returns.
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
 
-        This allows entirely decoupling the interface entirely from the
-        implementation details whilst maintaining RAII.
+        No instance should be created directly outside of this class method.
         """
-        # Note (jjerphan): Some design thoughts for future extensions.
-        # This factory comes to handle specialisations for the given arguments.
-        # For future work, this might can be an entrypoint to specialise operations
-        # for various backend and/or hardware and/or datatypes, and/or fused
-        # {sparse, dense}-datasetspair etc.
         if (
             metric in ("euclidean", "sqeuclidean")
             and not issparse(X)
@@ -669,7 +1064,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             # at time to leverage a call to the BLAS GEMM routine as explained
             # in more details in the docstring.
             use_squared_distances = metric == "sqeuclidean"
-            pda = FastEuclideanPairwiseDistancesArgKmin(
+            pda = FastEuclideanPairwiseDistancesArgKmin64(
                 X=X, Y=Y, k=k,
                 use_squared_distances=use_squared_distances,
                 chunk_size=chunk_size,
@@ -679,7 +1074,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
         else:
             # Fall back on a generic implementation that handles most scipy
             # metrics by computing the distances between 2 vectors at a time.
-            pda = PairwiseDistancesArgKmin(
+            pda = PairwiseDistancesArgKmin64(
                 datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs),
                 k=k,
                 chunk_size=chunk_size,
@@ -726,7 +1121,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction):
             sizeof(ITYPE_t *) * self.chunks_n_threads
         )
 
-        # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`.
+ # Main heaps which will be returned as results by `PairwiseDistancesArgKmin64.compute`. self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -764,11 +1159,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Y_start + j, ) - @final cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set each # thread's heaps pointer to the proper position on the main heaps. @@ -819,10 +1214,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): heaps_size * sizeof(ITYPE_t) ) - @final cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): @@ -839,185 +1235,76 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ITYPE_t idx, jdx, thread_num with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. - # This is done in parallel sample-wise (no need for locks). - # - # This might break each thread's data locality as each heap which - # was allocated in a thread is being now being used in several threads. - # - # Still, this parallel pattern has shown to be efficient in practice. - for idx in prange(X_end - X_start, schedule="static"): - for thread_num in range(self.chunks_n_threads): - for jdx in range(self.k): - heap_push( - &self.argkmin_distances[X_start + idx, 0], - &self.argkmin_indices[X_start + idx, 0], - self.k, - self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], - self.heaps_indices_chunks[thread_num][idx * self.k + jdx], - ) - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t idx, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for idx in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[idx, 0], - &self.argkmin_indices[idx, 0], - self.k, - ) - return - - cdef void compute_exact_distances(self) nogil: - cdef: - ITYPE_t i, j - ITYPE_t[:, ::1] Y_indices = self.argkmin_indices - DTYPE_t[:, ::1] distances = self.argkmin_distances - for i in prange(self.n_samples_X, schedule='static', nogil=True, - num_threads=self.effective_n_threads): - for j in range(self.k): - distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( - # Guard against eventual -0., causing nan production. - max(distances[i, j], 0.) - ) - - def _finalize_results(self, bint return_distance=False): - if return_distance: - # We need to recompute distances because we relied on - # surrogate distances for the reduction. - self.compute_exact_distances() - - # Values are returned identically to the way `KNeighborsMixin.kneighbors` - # returns values. This is counter-intuitive but this allows not using - # complex adaptations where `PairwiseDistancesArgKmin.compute` is called. 
-            return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
-
-        return np.asarray(self.argkmin_indices)
-
-
-cdef class GEMMTermComputer:
-    """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM.
-
-    `FastEuclidean*` classes internally compute the squared Euclidean distances between
-    chunks of vectors X_c and Y_c using using the decomposition:
-
-
-                ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-
-
-    This helper class is in charge of wrapping the common logic to compute
-    the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high
-    arithmetic intensity.
-    """
-
-    cdef:
-        const DTYPE_t[:, ::1] X
-        const DTYPE_t[:, ::1] Y
-
-        ITYPE_t effective_n_threads
-        ITYPE_t chunks_n_threads
-        ITYPE_t dist_middle_terms_chunks_size
-
-        # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM
-        vector[vector[DTYPE_t]] dist_middle_terms_chunks
-
-    def __init__(self,
-        DTYPE_t[:, ::1] X,
-        DTYPE_t[:, ::1] Y,
-        ITYPE_t effective_n_threads,
-        ITYPE_t chunks_n_threads,
-        ITYPE_t dist_middle_terms_chunks_size,
-    ):
-        self.X = X
-        self.Y = Y
-        self.effective_n_threads = effective_n_threads
-        self.chunks_n_threads = chunks_n_threads
-        self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size
-
-        self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads)
-
-    cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil:
-        self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size)
-
-    cdef void _parallel_on_Y_init(self) nogil:
-        for thread_num in range(self.chunks_n_threads):
-            self.dist_middle_terms_chunks[thread_num].resize(
-                self.dist_middle_terms_chunks_size
-            )
-
-    cdef DTYPE_t * _compute_distances_on_chunks(
-        self,
-        ITYPE_t X_start,
-        ITYPE_t X_end,
-        ITYPE_t Y_start,
-        ITYPE_t Y_end,
-        ITYPE_t thread_num,
-    ) nogil:
-        cdef:
-            ITYPE_t i, j
-            DTYPE_t squared_dist_i_j
-
-            const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
-
-            # Careful: LDA, LDB and LDC are given for F-ordered arrays
-            # in BLAS documentations, for instance:
-            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa
-            #
-            # Here, we use their counterpart values to work with C-ordered arrays.
-            BLAS_Order order = RowMajor
-            BLAS_Trans ta = NoTrans
-            BLAS_Trans tb = Trans
-            ITYPE_t m = X_c.shape[0]
-            ITYPE_t n = Y_c.shape[0]
-            ITYPE_t K = X_c.shape[1]
-            DTYPE_t alpha = - 2.
-            # Casting for A and B to remove the const is needed because APIs exposed via
-            # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier.
-            # See: https://github.com/scipy/scipy/issues/14262
-            DTYPE_t * A = &X_c[0, 0]
-            ITYPE_t lda = X_c.shape[1]
-            DTYPE_t * B = &Y_c[0, 0]
-            ITYPE_t ldb = X_c.shape[1]
-            DTYPE_t beta = 0.
-            ITYPE_t ldc = Y_c.shape[0]
+            # This is done in parallel sample-wise (no need for locks).
+            #
+            # This might break each thread's data locality as each heap which
+            # was allocated in a thread is now being used in several threads.
+            #
+            # Still, this parallel pattern has been shown to be efficient in practice.
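+            #
+            # Each row of X has one thread-local heap per thread; pushing all of
+            # their entries onto the row's main heap keeps the k smallest
+            # distances observed across every thread and Y chunk.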
+            for idx in prange(X_end - X_start, schedule="static"):
+                for thread_num in range(self.chunks_n_threads):
+                    for jdx in range(self.k):
+                        heap_push(
+                            &self.argkmin_distances[X_start + idx, 0],
+                            &self.argkmin_indices[X_start + idx, 0],
+                            self.k,
+                            self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx],
+                            self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
+                        )
 
-        # dist_middle_terms = `-2 * X_c @ Y_c.T`
-        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc)
+    cdef void _parallel_on_Y_finalize(
+        self,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, thread_num
 
-        return dist_middle_terms
+        with nogil, parallel(num_threads=self.chunks_n_threads):
+            # Deallocating temporary datastructures
+            for thread_num in prange(self.chunks_n_threads, schedule='static'):
+                free(self.heaps_r_distances_chunks[thread_num])
+                free(self.heaps_indices_chunks[thread_num])
 
+            # Sorting the main heaps in ascending order w.r.t. the distances.
+            # This is done in parallel sample-wise (no need for locks).
+            for idx in prange(self.n_samples_X, schedule='static'):
+                simultaneous_sort(
+                    &self.argkmin_distances[idx, 0],
+                    &self.argkmin_indices[idx, 0],
+                    self.k,
+                )
+        return
 
-cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin):
-    """Fast specialized variant for PairwiseDistancesArgKmin on EuclideanDistance.
+    cdef void compute_exact_distances(self) nogil:
+        cdef:
+            ITYPE_t i, j
+            ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
+            DTYPE_t[:, ::1] distances = self.argkmin_distances
+        for i in prange(self.n_samples_X, schedule='static', nogil=True,
+                        num_threads=self.effective_n_threads):
+            for j in range(self.k):
+                distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist(
+                    # Guard against a potential -0., causing nan production.
+                    max(distances[i, j], 0.)
+                )
 
-    The full pairwise squared distances matrix is computed as follows:
+    def _finalize_results(self, bint return_distance=False):
+        if return_distance:
+            # We need to recompute distances because we relied on
+            # surrogate distances for the reduction.
+            self.compute_exact_distances()
 
-        ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||²
+            # Values are returned identically to the way `KNeighborsMixin.kneighbors`
+            # returns values. This is counter-intuitive but it avoids complex
+            # adaptations where `PairwiseDistancesArgKmin64.compute` is called.
+            return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
 
-    The middle term gets computed efficiently below using BLAS Level 3 GEMM.
+        return np.asarray(self.argkmin_indices)
 
-    Notes
-    -----
-    This implementation has a superior arithmetic intensity and hence
-    better running time when the variant is IO bound, but it can suffer
-    from numerical instability caused by catastrophic cancellation potentially
-    introduced by the subtraction in the arithmetic expression above.
- """ +cdef class FastEuclideanPairwiseDistancesArgKmin64(PairwiseDistancesArgKmin64): + """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesArgKmin.""" cdef: - GEMMTermComputer gemm_term_computer + GEMMTermComputer64 gemm_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared @@ -1025,7 +1312,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and + return (PairwiseDistancesArgKmin64.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) def __init__( @@ -1045,7 +1332,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ): warnings.warn( f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " - f"usable for this case ({self.__class__.__name__}) and will be ignored.", + f"usable for this case (FastEuclideanPairwiseDistancesArgKmin) and will be ignored.", UserWarning, stacklevel=3, ) @@ -1059,50 +1346,118 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair cdef: - DenseDenseDatasetsPair datasets_pair = self.datasets_pair + DenseDenseDatasetsPair datasets_pair = ( + self.datasets_pair + ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.gemm_term_computer = GEMMTermComputer( + self.gemm_term_computer = GEMMTermComputer64( datasets_pair.X, datasets_pair.Y, self.effective_n_threads, self.chunks_n_threads, dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") else: - self.Y_norm_squared = _sqeuclidean_row_norms(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms64(datasets_pair.Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. 
self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms64(datasets_pair.X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances @final cdef void compute_exact_distances(self) nogil: if not self.use_squared_distances: - PairwiseDistancesArgKmin.compute_exact_distances(self) + PairwiseDistancesArgKmin64.compute_exact_distances(self) @final cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num) + PairwiseDistancesArgKmin64._parallel_on_X_parallel_init(self, thread_num) self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final cdef void _parallel_on_Y_init( self, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesArgKmin._parallel_on_Y_init(self) + PairwiseDistancesArgKmin64._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final cdef void _compute_and_reduce_distances_on_chunks( self, @@ -1145,30 +1500,8 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) -cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): - """Compute radius-based neighbors for two sets of vectors. - - For each row-vector X[i] of the queries X, find all the indices j of - row-vectors in Y such that: - - dist(X[i], Y[j]) <= radius - - The distance function `dist` depends on the values of the `metric` - and `metric_kwargs` parameters. - - Parameters - ---------- - datasets_pair: DatasetsPair - The dataset pair (X, Y) for the reduction. - - chunk_size: int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - radius: float - The radius defining the neighborhood. 
- """ +cdef class PairwiseDistancesRadiusNeighborhood64(PairwiseDistancesReduction64): + """64bit implementation of PairwiseDistancesArgKmin.""" cdef: DTYPE_t radius @@ -1182,7 +1515,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. # # For this implementation, we want resizable buffers which we will wrap - # into numpy arrays at the end. std::vector comes as a handy interface + # into numpy arrays at the end. std::vector comes as a handy container # for interacting efficiently with resizable buffers. # # Though it is possible to access their buffer address with @@ -1219,98 +1552,19 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): bint return_distance=False, bint sort_results=False, ): - """Return the results of the reduction for the given arguments. - - Parameters - ---------- - X : ndarray or CSR matrix of shape (n_samples_X, n_features) - Input data. - - Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) - Input data. + """Compute the radius-neighbors reduction. - radius : float - The radius defining the neighborhood. - - metric : str, default='euclidean' - The distance metric to use. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. - - chunk_size : int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - metric_kwargs : dict, default=None - Keyword arguments to pass to specified metric function. - - strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. - - For both strategies the computations happens with two nested loops, - respectively on chunks of X and chunks of Y. - Strategies differs on which loop (outer or inner) is made to run - in parallel with the Cython `prange` construct: - - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. - - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is - a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. - - - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, - 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` - is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity - for parallelism and is therefore more efficient despite the synchronization - step at each iteration of the outer loop on chunks of `X`. - - - None (default) looks-up in scikit-learn configuration for - `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. - - return_distance : boolean, default=False - Return distances between each X vector and its neighbors if set to True. - - sort_results : boolean, default=False - Sort results with respect to distances between each X vector and its - neighbors if set to True. - - Returns - ------- - If return_distance=False: - - neighbors_indices : ndarray of n_samples_X ndarray - Indices of the neighbors for each vector in X. 
- - If return_distance=True: - - neighbors_indices : ndarray of n_samples_X ndarray - Indices of the neighbors for each vector in X. - - neighbors_distances : ndarray of n_samples_X ndarray - Distances to the neighbors for each vector in X. - - Notes - ----- - This public classmethod is responsible for introspecting the arguments - values to dispatch to the private - :meth:`PairwiseDistancesRadiusNeighborhood._compute` instance method of - the most appropriate :class:`PairwiseDistancesRadiusNeighborhood` - concrete implementation. + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`PairwiseDistancesRadiusNeighborhood64`. - All temporarily allocated datastructures necessary for the concrete - implementation are therefore freed when this classmethod returns. + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. - This allows entirely decoupling the interface entirely from the - implementation details whilst maintaining RAII. + No instance should directly be created outside of this class method. """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. if ( metric in ("euclidean", "sqeuclidean") and not issparse(X) @@ -1321,7 +1575,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # at time to leverage a call to the BLAS GEMM routine as explained # in more details in the docstring. use_squared_distances = metric == "sqeuclidean" - pda = FastEuclideanPairwiseDistancesRadiusNeighborhood( + pda = FastEuclideanPairwiseDistancesRadiusNeighborhood64( X=X, Y=Y, radius=radius, use_squared_distances=use_squared_distances, chunk_size=chunk_size, @@ -1332,7 +1586,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): else: # Fall back on a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. - pda = PairwiseDistancesRadiusNeighborhood( + pda = PairwiseDistancesRadiusNeighborhood64( datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size, @@ -1423,11 +1677,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): return coerce_vectors_to_nd_arrays(self.neigh_indices) - @final cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the @@ -1546,26 +1800,10 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): ) -cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): - """Fast specialized variant for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. - - The full pairwise squared distances matrix is computed as follows: - - ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||² - - The middle term gets computed efficiently below using BLAS Level 3 GEMM. 
- - Notes - ----- - This implementation has a superior arithmetic intensity and hence - better running time when the variant is IO bound, but it can suffer - from numerical instability caused by catastrophic cancellation potentially - introduced by the subtraction in the arithmetic expression above. - numerical precision is needed. - """ - +cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood64(PairwiseDistancesRadiusNeighborhood64): + """EuclideanDistance-specialized 64bit implementation for PairwiseDistancesRadiusNeighborhood.""" cdef: - GEMMTermComputer gemm_term_computer + GEMMTermComputer64 gemm_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared @@ -1573,7 +1811,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) + return (PairwiseDistancesRadiusNeighborhood64.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) def __init__( @@ -1594,7 +1832,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad ): warnings.warn( f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " - f"usable for this case ({self.__class__.__name__}) and will be ignored.", + f"usable for this case (FastEuclideanPairwiseDistancesRadiusNeighborhood) and will be ignored.", UserWarning, stacklevel=3, ) @@ -1613,23 +1851,25 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad DenseDenseDatasetsPair datasets_pair = self.datasets_pair ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.gemm_term_computer = GEMMTermComputer( + self.gemm_term_computer = GEMMTermComputer64( datasets_pair.X, datasets_pair.Y, self.effective_n_threads, self.chunks_n_threads, dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") else: - self.Y_norm_squared = _sqeuclidean_row_norms(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms64(datasets_pair.Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms64(datasets_pair.X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances @@ -1638,27 +1878,85 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad # already considered to be the adapted radius, so we overwrite it. 
self.r_radius = radius - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) - @final cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num) + PairwiseDistancesRadiusNeighborhood64._parallel_on_X_parallel_init(self, thread_num) self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + @final cdef void _parallel_on_Y_init( self, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self) + PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() + @final + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood64._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesRadiusNeighborhood64.compute_exact_distances(self) + @final cdef void _compute_and_reduce_distances_on_chunks( self, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index fa475134c7a9f..b47407f3754ee 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -12,7 +12,7 @@ PairwiseDistancesReduction, PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood, - _sqeuclidean_row_norms, + _sqeuclidean_row_norms64, ) from sklearn.metrics import euclidean_distances @@ -967,6 +967,6 @@ def test_sqeuclidean_row_norms( X = rng.rand(n_samples, n_features).astype(dtype) * spread sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 - sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + sq_row_norm = np.asarray(_sqeuclidean_row_norms64(X, num_threads=num_threads)) assert_allclose(sq_row_norm_reference, sq_row_norm)
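Reviewer note on the FastEuclidean* specializations touched above: they rely on
expanding the squared distance as ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||², so that
the dominant middle term is computed with a single BLAS GEMM call (handled here by
GEMMTermComputer64). The following standalone NumPy sketch, which is not part of
this diff and whose name `gemm_radius_neighbors` is purely illustrative, shows the
same expansion in miniature, including the clipping guard needed because
catastrophic cancellation can yield slightly negative squared distances:

    import numpy as np

    def gemm_radius_neighbors(X, Y, radius):
        # Row-wise squared norms: ||x||^2 and ||y||^2.
        X_norm_squared = np.einsum("ij,ij->i", X, X)
        Y_norm_squared = np.einsum("ij,ij->i", Y, Y)
        # The -2 X.Y^T middle term is a single matrix-matrix product
        # (a BLAS GEMM), which is what gives this variant its superior
        # arithmetic intensity.
        sq_dists = (
            X_norm_squared[:, np.newaxis]
            - 2.0 * (X @ Y.T)
            + Y_norm_squared[np.newaxis, :]
        )
        # Catastrophic cancellation can produce small negative values.
        np.maximum(sq_dists, 0.0, out=sq_dists)
        # Compare squared distances against the squared radius.
        return [np.flatnonzero(row <= radius ** 2) for row in sq_dists]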
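Similarly, a hypothetical smoke test (not included in this diff) for the renamed
entry points, assuming the `compute` signature documented in the docstring above
and the `num_threads` keyword exercised by `test_sqeuclidean_row_norms`:

    import numpy as np
    from sklearn.metrics._pairwise_distances_reduction import (
        PairwiseDistancesRadiusNeighborhood,
        _sqeuclidean_row_norms64,
    )

    rng = np.random.RandomState(0)
    X = rng.rand(100, 10).astype(np.float64)
    Y = rng.rand(500, 10).astype(np.float64)

    # The renamed 64bit helper agrees with the NumPy reference.
    sq_norms = np.asarray(_sqeuclidean_row_norms64(X, num_threads=2))
    assert np.allclose(sq_norms, np.linalg.norm(X, axis=1) ** 2)

    # The dispatcher routes dense float64 inputs to
    # PairwiseDistancesRadiusNeighborhood64 and, for this metric, to its
    # FastEuclidean* subclass; with return_distance=False it returns an
    # ndarray of n_samples_X ndarrays of neighbor indices.
    neigh_indices = PairwiseDistancesRadiusNeighborhood.compute(
        X=X, Y=Y, radius=0.5, metric="euclidean",
    )
    assert len(neigh_indices) == X.shape[0]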