diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index ee0f75b92c789..0845eb0223516 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -74,8 +74,8 @@ Changelog - |Efficiency| Low-level routines for reductions on pairwise distances for dense float64 datasets have been refactored. The following functions - and estimators now benefit from improved performances, in particular on - multi-cores machines: + and estimators now benefit from improved performances in terms of hardware + scalability and speed-ups: - :func:`sklearn.metrics.pairwise_distances_argmin` - :func:`sklearn.metrics.pairwise_distances_argmin_min` - :class:`sklearn.cluster.AffinityPropagation` @@ -86,6 +86,8 @@ Changelog - :func:`sklearn.feature_selection.mutual_info_regression` - :class:`sklearn.neighbors.KNeighborsClassifier` - :class:`sklearn.neighbors.KNeighborsRegressor` + - :class:`sklearn.neighbors.RadiusNeighborsClassifier` + - :class:`sklearn.neighbors.RadiusNeighborsRegressor` - :class:`sklearn.neighbors.LocalOutlierFactor` - :class:`sklearn.neighbors.NearestNeighbors` - :class:`sklearn.manifold.Isomap` @@ -95,10 +97,11 @@ Changelog - :class:`sklearn.semi_supervised.LabelPropagation` - :class:`sklearn.semi_supervised.LabelSpreading` - For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` - can be up to ×20 faster than in the previous versions'. + For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` and + :class:`sklearn.neighbors.NearestNeighbors.radius_neighbors` + can respectively be up to ×20 and ×5 faster than previously. 
- :pr:`21987`, :pr:`22064`, :pr:`22065` and :pr:`22288` + :pr:`21987`, :pr:`22064`, :pr:`22065`, :pr:`22288` and :pr:`22320` by :user:`Julien Jerphanion <jjerphan>` - |Enhancement| All scikit-learn models now generate a more informative diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 7dae016b2334d..29ac839187fc9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -20,8 +20,12 @@ import warnings from .. import get_config from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX +from libcpp.memory cimport shared_ptr, make_shared +from libcpp.vector cimport vector from cython cimport final +from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange +from cpython.ref cimport Py_INCREF from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair from ..utils._cython_blas cimport ( @@ -36,9 +40,10 @@ from ..utils._cython_blas cimport ( ) from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num -from ..utils._typedefs cimport ITYPE_t, DTYPE_t +from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t +from ..utils._typedefs cimport ITYPECODE, DTYPECODE -from numbers import Integral +from numbers import Integral, Real from typing import List from scipy.sparse import issparse from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING @@ -50,6 +55,106 @@ from ..utils._typedefs import ITYPE, DTYPE np.import_array() +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa +cdef extern from "<algorithm>" namespace "std" nogil: + OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa + +###################### +## std::vector to np.ndarray coercion +# As 
type covariance is not supported for C++ containers via Cython, +# we need to redefine fused types. +ctypedef fused vector_DITYPE_t: + vector[ITYPE_t] + vector[DTYPE_t] + + +ctypedef fused vector_vector_DITYPE_t: + vector[vector[ITYPE_t]] + vector[vector[DTYPE_t]] + + +cdef class StdVectorSentinel: + """Wraps a reference to a vector which will be deallocated with this object. + + When created, the StdVectorSentinel swaps the reference of its internal + vectors with the provided one (vec_ptr), thus making the StdVectorSentinel + manage the provided one's lifetime. + """ + pass + + +# We necessarily need to define two extension types extending StdVectorSentinel +# because we need to provide the dtype of the vector but can't use numeric fused types. +cdef class StdVectorSentinelDTYPE(StdVectorSentinel): + cdef vector[DTYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[DTYPE_t] * vec_ptr): + # This initializes the object directly without calling __init__ + # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa + cdef StdVectorSentinelDTYPE sentinel = StdVectorSentinelDTYPE.__new__(StdVectorSentinelDTYPE) + sentinel.vec.swap(deref(vec_ptr)) + return sentinel + + +cdef class StdVectorSentinelITYPE(StdVectorSentinel): + cdef vector[ITYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[ITYPE_t] * vec_ptr): + # This initializes the object directly without calling __init__ + # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa + cdef StdVectorSentinelITYPE sentinel = StdVectorSentinelITYPE.__new__(StdVectorSentinelITYPE) + sentinel.vec.swap(deref(vec_ptr)) + return sentinel + + +cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): + """Create a numpy ndarray given a C++ vector. + + The numpy array buffer is the one of the C++ vector. 
+ A StdVectorSentinel is registered as the base object for the numpy array, + freeing the C++ vector it encapsulates when the numpy array is freed. + """ + typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE + cdef: + np.npy_intp size = deref(vect_ptr).size() + np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, + deref(vect_ptr).data()) + StdVectorSentinel sentinel + + if vector_DITYPE_t is vector[DTYPE_t]: + sentinel = StdVectorSentinelDTYPE.create_for(vect_ptr) + else: + sentinel = StdVectorSentinelITYPE.create_for(vect_ptr) + + # Makes the numpy array responsible of the life-cycle of its buffer. + # A reference to the StdVectorSentinel will be stolen by the call to + # `PyArray_SetBaseObject` below, so we increase its reference counter. + # See: https://docs.python.org/3/c-api/intro.html#reference-count-details + Py_INCREF(sentinel) + np.PyArray_SetBaseObject(arr, sentinel) + return arr + + +cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_DITYPE_t] vecs +): + """Coerce a std::vector of std::vector to a ndarray of ndarray.""" + cdef: + ITYPE_t n = deref(vecs).size() + np.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, + dtype=np.ndarray) + + for i in range(n): + nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) + + return nd_arrays_of_nd_arrays + +##################### + cpdef DTYPE_t[::1] _sqeuclidean_row_norms( const DTYPE_t[:, ::1] X, ITYPE_t num_threads, @@ -74,7 +179,7 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms( return squared_row_norms - +##################### cdef class PairwiseDistancesReduction: """Abstract base class for pairwise distance computation & reduction. @@ -1056,3 +1161,574 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ), j + Y_start, ) + + +cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): + """Compute radius-based neighbors for two sets of vectors. 
+ + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + The distance function `dist` depends on the values of the `metric` + and `metric_kwargs` parameters. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pair (X, Y) for the reduction. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + radius: float + The radius defining the neighborhood. + """ + + cdef: + DTYPE_t radius + + # DistanceMetric computes rank-preserving surrogate distances via rdist + # which are proxies requiring fewer computations. + # We get the equivalent for the radius to be able to compare it against + # vectors' rank-preserving surrogate distances. + DTYPE_t r_radius + + # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. + # + # For this implementation, we want resizable buffers which we will wrap + # into numpy arrays at the end. std::vector comes as a handy interface + # for interacting efficiently with resizable buffers. + # + # Though it is possible to access their buffer address with + # std::vector::data, they can't be stolen: buffers' lifetime + # is tied to their std::vector and are deallocated when + # std::vectors are. + # + # To solve this, we dynamically allocate std::vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when the associated np.ndarray is freed. + # + # Shared pointers (defined via shared_ptr) are used for safer memory management. + # Unique pointers (defined via unique_ptr) can't be used as datastructures + # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. 
+ shared_ptr[vector[vector[ITYPE_t]]] neigh_indices + shared_ptr[vector[vector[DTYPE_t]]] neigh_distances + + # Used as array of pointers to private datastructures used in threads. + vector[shared_ptr[vector[vector[ITYPE_t]]]] neigh_indices_chunks + vector[shared_ptr[vector[vector[DTYPE_t]]]] neigh_distances_chunks + + bint sort_results + + @classmethod + def compute( + cls, + X, + Y, + DTYPE_t radius, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + bint sort_results=False, + ): + """Return the results of the reduction for the given arguments. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + radius : float + The radius defining the neighborhood. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. 
+ + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its neighbors if set to True. + + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + Returns + ------- + If return_distance=False: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + + If return_distance=True: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + - neighbors_distances : ndarray of n_samples_X ndarray + Distances to the neighbors for each vector in X. + + Notes + ----- + This public classmethod is responsible for introspecting the arguments + values to dispatch to the private + :meth:`PairwiseDistancesRadiusNeighborhood._compute` instance method of + the most appropriate :class:`PairwiseDistancesRadiusNeighborhood` + concrete implementation. + + All temporarily allocated datastructures necessary for the concrete + implementation are therefore freed when this classmethod returns. 
+ + This allows entirely decoupling the interface from the + implementation details whilst maintaining RAII. + """ + # Note (jjerphan): Some design thoughts for future extensions. + # This factory comes to handle specialisations for the given arguments. + # For future work, this might be an entrypoint to specialise operations + # for various backends and/or hardware and/or datatypes, and/or fused + # {sparse, dense}-datasetspair etc. + if ( + metric in ("euclidean", "sqeuclidean") + and not issparse(X) + and not issparse(Y) + ): + # Specialized implementation with improved arithmetic intensity + # and vector instructions (SIMD) by processing several vectors + # at a time to leverage a call to the BLAS GEMM routine as explained + # in more detail in the docstring. + use_squared_distances = metric == "sqeuclidean" + pda = FastEuclideanPairwiseDistancesRadiusNeighborhood( + X=X, Y=Y, radius=radius, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = PairwiseDistancesRadiusNeighborhood( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+ with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + + def __init__( + self, + DatasetsPair datasets_pair, + DTYPE_t radius, + chunk_size=None, + strategy=None, + sort_results=False, + metric_kwargs=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + ) + + self.radius = check_scalar(radius, "radius", Real, min_val=0) + self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) + self.sort_results = sort_results + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, those pointers are referencing + # self.neigh_distances and self.neigh_indices + # - when parallelizing on Y, those pointers are referencing + # std::vectors of std::vectors which are thread-wise-allocated and whose + # content will be merged into self.neigh_distances and self.neigh_indices. + self.neigh_distances_chunks = vector[shared_ptr[vector[vector[DTYPE_t]]]]( + self.chunks_n_threads + ) + self.neigh_indices_chunks = vector[shared_ptr[vector[vector[ITYPE_t]]]]( + self.chunks_n_threads + ) + + # Temporary datastructures which will be coerced to numpy arrays before + # PairwiseDistancesRadiusNeighborhood.compute "return" and will then be freed. 
+ self.neigh_distances = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) + self.neigh_indices = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t r_dist_i_j + + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): + r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ) nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread vectors' pointers to the main vectors'. 
+ self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + cdef void _parallel_on_Y_init( + self, + ) nogil: + cdef: + ITYPE_t thread_num + # As chunks of X are shared across threads, so must datastructures to avoid race + # conditions: each thread has its own vectors of n_samples_X vectors which are + # then merged back in the main n_samples_X vectors. + for thread_num in range(self.chunks_n_threads): + self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[DTYPE_t]]](self.n_samples_X) + self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[ITYPE_t]]](self.n_samples_X) + + @final + cdef void _merge_vectors( + self, + ITYPE_t idx, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t thread_num + ITYPE_t idx_n_elements = 0 + ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers only once for the given number of elements. + for thread_num in range(num_threads): + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + + # Moving the elements by range using the range first element + # as the reference for the insertion. 
+ for thread_num in range(num_threads): + move( + deref(self.neigh_distances_chunks[thread_num])[idx].begin(), + deref(self.neigh_distances_chunks[thread_num])[idx].end(), + deref(self.neigh_distances)[idx].begin() + last_element_idx + ) + move( + deref(self.neigh_indices_chunks[thread_num])[idx].begin(), + deref(self.neigh_indices_chunks[thread_num])[idx].end(), + deref(self.neigh_indices)[idx].begin() + last_element_idx + ) + last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + + with nogil, parallel(num_threads=self.effective_n_threads): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks) + # using dynamic scheduling because we might not have + # the same number of neighbors for each query vector. + # TODO: compare 'dynamic' vs 'static' vs 'guided' + for idx in prange(self.n_samples_X, schedule='dynamic'): + self._merge_vectors(idx, self.chunks_n_threads) + + # The content of the vector have been std::moved. + # Hence they can't be used anymore and can be deleted. + # Their deletion is carried out automatically as the + # implementation relies on shared pointers. + + # Sort in parallel in ascending order w.r.t the distances if requested. 
+ if self.sort_results: + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + return + + cdef void compute_exact_distances(self) nogil: + """Convert rank-preserving distances to pairwise distances in parallel.""" + cdef: + ITYPE_t i, j + + for i in prange(self.n_samples_X, nogil=True, schedule='dynamic', + num_threads=self.effective_n_threads): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + max(deref(self.neigh_distances)[i][j], 0.) + ) + ) + + +cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): + """Fast specialized alternative for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. + + The full pairwise squared distances matrix is computed as follows: + + ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||² + + The middle term gets computed efficiently below using BLAS Level 3 GEMM. + + Notes + ----- + This implementation has a superior arithmetic intensity and hence + better running time when the alternative is IO bound, but it can suffer + from numerical instability caused by catastrophic cancellation potentially + introduced by the subtraction in the arithmetic expression above. + This matters when high numerical precision is needed. 
+ """ + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_norm_squared + const DTYPE_t[::1] Y_norm_squared + + # Buffers for GEMM + vector[vector[DTYPE_t]] dist_middle_terms_chunks + bint use_squared_distances + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + DTYPE_t radius, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + sort_results=False, + metric_kwargs=None, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + metric_kwargs=metric_kwargs, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") + else: + self.Y_norm_squared = _sqeuclidean_row_norms(self.Y, self.effective_n_threads) + + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms(self.X, self.effective_n_threads) + ) + self.use_squared_distances = use_squared_distances + + if use_squared_distances: + # In this specialisation and this setup, the value passed to the radius is + # already considered to be the adapted radius, so we overwrite it. 
+ self.r_radius = radius + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = vector[vector[DTYPE_t]]( + self.effective_n_threads + ) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) + + @final + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num) + + # Temporary buffer for the `-2 * X_c @ Y_c.T` term + self.dist_middle_terms_chunks[thread_num].resize( + self.Y_n_samples_chunk * self.X_n_samples_chunk + ) + + @final + cdef void _parallel_on_Y_init( + self, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self) + + for thread_num in range(self.chunks_n_threads): + # Temporary buffer for the `-2 * X_c @ Y_c.T` term + self.dist_middle_terms_chunks[thread_num].resize( + self.Y_n_samples_chunk * self.X_n_samples_chunk + ) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. 
+ # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + DTYPE_t * A = &X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = &Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = `-2 * X_c @ Y_c.T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + # Pushing the distance and their associated indices in vectors. + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + # Using the squared euclidean distance as the rank-preserving distance: + # + # ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + # + squared_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_norm_squared[j + Y_start] + ) + if squared_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 1c26d9969397c..736ba6d7d4424 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -30,7 +30,9 @@ def configuration(parent_package="", top_path=None): "_pairwise_distances_reduction", sources=["_pairwise_distances_reduction.pyx"], include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], + language="c++", libraries=libraries, + extra_compile_args=["-std=c++11"], ) config.add_subpackage("tests") diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index d2af7e7988aa8..308eece1fb6df 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -8,6 +8,7 @@ from sklearn.metrics._pairwise_distances_reduction import 
( PairwiseDistancesReduction, PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, _sqeuclidean_row_norms, ) @@ -30,7 +31,7 @@ ] -def _get_dummy_metric_params_list(metric: str, n_features: int): +def _get_metric_params_list(metric: str, n_features: int): """Return list of dummy DistanceMetric kwargs for tests.""" # Distinguishing on cases not to compute unneeded datastructures. @@ -79,8 +80,25 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): ) +def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): + # We get arrays of arrays and we need to check for individual pairs + for i in range(ref_dist.shape[0]): + assert_array_equal( + ref_indices[i], + indices[i], + err_msg=f"Query vector #{i} has different neighbors' indices", + ) + assert_allclose( + ref_dist[i], + dist[i], + err_msg=f"Query vector #{i} has different neighbors' distances", + rtol=1e-7, + ) + + ASSERT_RESULT = { PairwiseDistancesArgKmin: assert_argkmin_results_equality, + PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality, } @@ -149,12 +167,62 @@ def test_argkmin_factory_method_wrong_usages(): ) +def test_radius_neighborhood_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, + match=( + "Only 64bit float datasets are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ), + ): + PairwiseDistancesRadiusNeighborhood.compute( + X=X.astype(np.float32), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises( + ValueError, + match=( + "Only 64bit float datasets are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ), + ): + PairwiseDistancesRadiusNeighborhood.compute( + X=X, Y=Y.astype(np.int32), radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + 
PairwiseDistancesRadiusNeighborhood.compute(X=X, Y=Y, radius=-1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesRadiusNeighborhood.compute( + X=X, Y=Y, radius=radius, metric="wrong metric" + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesRadiusNeighborhood.compute( + X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesRadiusNeighborhood.compute( + X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric + ) + + @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin], + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], ) def test_chunk_size_agnosticism( PairwiseDistancesReduction, @@ -200,7 +268,7 @@ def test_chunk_size_agnosticism( @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin], + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], ) def test_n_threads_agnosticism( PairwiseDistancesReduction, @@ -245,7 +313,7 @@ def test_n_threads_agnosticism( @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) @pytest.mark.parametrize( "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin], + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], ) def test_strategies_consistency( PairwiseDistancesReduction, @@ -279,7 +347,7 @@ def test_strategies_consistency( parameter, metric=metric, # Taking the first - metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + metric_kwargs=_get_metric_params_list(metric, n_features)[0], # To be sure to use parallelization chunk_size=n_samples // 4, 
strategy="parallel_on_X", @@ -292,7 +360,7 @@ def test_strategies_consistency( parameter, metric=metric, # Taking the first - metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + metric_kwargs=_get_metric_params_list(metric, n_features)[0], # To be sure to use parallelization chunk_size=n_samples // 4, strategy="parallel_on_Y", @@ -307,7 +375,7 @@ def test_strategies_consistency( ) -# Concrete PairwiseDistancesReductions tests +# "Concrete PairwiseDistancesReductions"-specific tests # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @@ -334,7 +402,7 @@ def test_pairwise_distances_argkmin( X = np.ascontiguousarray(X[:, :2]) Y = np.ascontiguousarray(Y[:, :2]) - metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] + metric_kwargs = _get_metric_params_list(metric, n_features)[0] # Reference for argkmin results if metric == "euclidean": @@ -368,6 +436,70 @@ def test_pairwise_distances_argkmin( ) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") +@pytest.mark.parametrize("n_features", [50, 500]) +@pytest.mark.parametrize("translation", [0, 1e6]) +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) +def test_pairwise_distances_radius_neighbors( + n_features, + translation, + metric, + strategy, + n_samples=100, + dtype=np.float64, +): + rng = np.random.RandomState(0) + spread = 1000 + radius = spread * np.log(n_features) + X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + + metric_kwargs = _get_metric_params_list(metric, n_features)[0] + + # Reference for radius neighborhood results + if metric == "euclidean": + # Compare to scikit-learn GEMM optimized 
implementation + dist_matrix = euclidean_distances(X, Y) + else: + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) + + # Getting the neighbors for a given radius + neigh_indices_ref = [] + neigh_distances_ref = [] + + for row in dist_matrix: + ind = np.arange(row.shape[0])[row <= radius] + dist = row[ind] + + sort = np.argsort(dist) + ind, dist = ind[sort], dist[sort] + + neigh_indices_ref.append(ind) + neigh_distances_ref.append(dist) + + neigh_indices_ref = np.array(neigh_indices_ref) + neigh_distances_ref = np.array(neigh_distances_ref) + + neigh_distances, neigh_indices = PairwiseDistancesRadiusNeighborhood.compute( + X, + Y, + radius, + metric=metric, + metric_kwargs=metric_kwargs, + return_distance=True, + # So as to have more than a chunk, forcing parallelism. + chunk_size=n_samples // 4, + strategy=strategy, + sort_results=True, + ) + + ASSERT_RESULT[PairwiseDistancesRadiusNeighborhood]( + neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref + ) + + @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("n_features", [5, 10, 100]) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index c29cace6d01a9..10e99a34e6497 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -24,6 +24,7 @@ from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..metrics._pairwise_distances_reduction import ( PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, ) from ..utils import ( check_array, @@ -1070,20 +1071,43 @@ class from an array representing our data set and ask who's """ check_is_fitted(self) - if X is not None: - query_is_train = False + if sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + query_is_train = X is None + if query_is_train: + X = self._fit_X + else: if self.metric == "precomputed": X = _check_precomputed(X) else: - X = 
self._validate_data(X, accept_sparse="csr", reset=False) - else: - query_is_train = True - X = self._fit_X + X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") if radius is None: radius = self.radius - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesRadiusNeighborhood.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + + if use_pairwise_distances_reductions: + results = PairwiseDistancesRadiusNeighborhood.compute( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + sort_results=sort_results, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): results = _radius_neighbors_from_graph( X, radius=radius, return_distance=return_distance ) @@ -1126,10 +1150,6 @@ class from an array representing our data set and ask who's results = _to_object_array(neigh_ind_list) if sort_results: - if not return_distance: - raise ValueError( - "return_distance must be True if sort_results is True." 
- ) for ii in range(len(neigh_dist)): order = np.argsort(neigh_dist[ii], kind="mergesort") neigh_ind[ii] = neigh_ind[ii][order] diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a1e0b01ef3eeb..e7ee8a507838e 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -25,6 +25,9 @@ from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS +from sklearn.metrics.tests.test_pairwise_distances_reduction import ( + assert_radius_neighborhood_results_equality, +) from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors import VALID_METRICS_SPARSE @@ -2035,6 +2038,61 @@ def test_neighbors_distance_metric_deprecation(): assert isinstance(dist_metric, ActualDistanceMetric) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") +@pytest.mark.parametrize( + "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) +) +def test_radius_neighbors_brute_backend( + metric, n_samples=2000, n_features=30, n_query_pts=100, n_neighbors=5 +): + # Both backends for the 'brute' algorithm of radius_neighbors + # must give identical results. 
+ X_train = rng.rand(n_samples, n_features) + X_test = rng.rand(n_query_pts, n_features) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + metric_params_list = _generate_test_params_for(metric, n_features) + + # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0 + ExceptionToAssert = None + if metric == "wminkowski" and sp_version >= parse_version("1.6.0"): + ExceptionToAssert = FutureWarning + + for metric_params in metric_params_list: + p = metric_params.pop("p", 2) + + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm="brute", + metric=metric, + p=p, + metric_params=metric_params, + ) + + neigh.fit(X_train) + with pytest.warns(ExceptionToAssert): + with config_context(enable_cython_pairwise_dist=False): + # Use the legacy backend for brute + legacy_brute_dst, legacy_brute_idx = neigh.radius_neighbors( + X_test, return_distance=True + ) + with config_context(enable_cython_pairwise_dist=True): + # Use the PairwiseDistancesReduction as a backend for brute + pdr_brute_dst, pdr_brute_idx = neigh.radius_neighbors( + X_test, return_distance=True + ) + + assert_radius_neighborhood_results_equality( + legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, pdr_brute_idx + ) + + def test_valid_metrics_has_no_duplicate(): for val in neighbors.VALID_METRICS.values(): assert len(val) == len(set(val))