From de166e03100a33d7b0da900a88eacabbbc47e1e6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 10:15:33 +0100 Subject: [PATCH 01/22] MAINT Introduce Pairwise Distances Reductions private submodule This introduces the neccessary private implementations for a new private submodule, i.e.: - DatasetsPair, an abstraction to wrap a pair of two datasets and compute their vectors pairwise distances - DenseDenseDatasetsPair, a first implementation of DatasetsPair for pair of two dense datasets - PairwiseDistancesReduction, an abstraction allowing computing reductions efficiently in parallel and of - PairwiseDistancesArgkmin, a first implementation of PairwiseDistancesReduction for k-Nearest Neighbors search --- sklearn/metrics/_dist_metrics.pxd | 21 + sklearn/metrics/_dist_metrics.pyx | 188 +++- .../metrics/_pairwise_distances_reduction.pyx | 821 ++++++++++++++++++ sklearn/metrics/setup.py | 6 + .../test_pairwise_distances_reduction.py | 379 ++++++++ sklearn/utils/__init__.py | 35 +- sklearn/utils/_openmp_helpers.pxd | 6 + sklearn/utils/_openmp_helpers.pyx | 15 +- sklearn/utils/_testing.py | 11 +- 9 files changed, 1468 insertions(+), 14 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction.pyx create mode 100644 sklearn/metrics/tests/test_pairwise_distances_reduction.py create mode 100644 sklearn/utils/_openmp_helpers.pxd diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 611f6759e2c8b..e7c2f2ea2f926 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -64,3 +64,24 @@ cdef class DistanceMetric: cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1 cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 + + +###################################################################### +# DatasetsPair base class +cdef class DatasetsPair: + cdef DistanceMetric distance_metric + + cdef ITYPE_t n_samples_X(self) nogil + + cdef ITYPE_t n_samples_Y(self) nogil + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil + + +cdef class DenseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + ITYPE_t d diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index f7d22c1badfa2..3def08da7965c 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -4,6 +4,8 @@ import numpy as np cimport numpy as np +from cython cimport final + np.import_array() # required in order to use C-API @@ -23,10 +25,10 @@ cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n): return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -# some handy constants from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf +from scipy.sparse import csr_matrix, issparse from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE from ..utils._typedefs import DTYPE, ITYPE from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper @@ -67,6 +69,16 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance, 'haversine': HaversineDistance, 'pyfunc': PyFuncDistance} +BOOL_METRICS = [ + "matching", + "jaccard", + "dice", + "kulsinski", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] def get_valid_metric_ids(L): """Given an iterable of metric class names or class identifiers, @@ -195,8 +207,8 @@ cdef class DistanceMetric: """ def __cinit__(self): self.p = 2 - self.vec = 
np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -306,8 +318,9 @@ cdef class DistanceMetric: This can optionally be overridden in a base class. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. """ return self.dist(x1, x2, size) @@ -343,8 +356,9 @@ cdef class DistanceMetric: """Convert the rank-preserving surrogate distance to the distance. The surrogate distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. Parameters ---------- @@ -362,8 +376,9 @@ cdef class DistanceMetric: """Convert the true distance to the rank-preserving surrogate distance. The surrogate distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the surrogate distance is the squared-euclidean distance. + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. Parameters ---------- @@ -1150,3 +1165,158 @@ cdef class PyFuncDistance(DistanceMetric): cdef inline double fmax(double a, double b) nogil: return max(a, b) + + +###################################################################### +# Datasets Pair Classes +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between a single pair of rows of + of X and Y at a time given the pair of their indices (i, j). This class is + specialized for each metric thanks to the :func:`get_for` factory classmethod. + + The handling of parallelization over chunks to compute the distances + and aggregation for several rows at a time is done in dedicated + subclasses of PairwiseDistancesReduction that in-turn rely on + subclasses of DatasetsPair for each pair of rows in the data. The goal + is to make it possible to decouple the generic parallelization and + aggregation logic from metric-specific computation as much as + possible. + + X and Y can be stored as np.ndarrays or CSR matrices in subclasses. + + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of dispatching method calls. + + Parameters + ---------- + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + str metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair: + """Return the DatasetsPair implementation for the given arguments. 
+ + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str, default='euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair + The suited DatasetsPair implementation. + """ + cdef: + DistanceMetric distance_metric = DistanceMetric.get_metric( + metric, + **(metric_kwargs or {}) + ) + + if X.dtype != np.float64 or Y.dtype != np.float64: + raise ValueError("Only 64bit float datasets are supported for X and Y.") + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + if issparse(X) or issparse(Y): + raise ValueError("Only dense datasets are supported for X and Y.") + + return DenseDenseDatasetsPair(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=ITYPE) + X_indptr = np.asarray(X.indptr, dtype=ITYPE) + return X_data, X_indptr, X_indptr + + def __init__(self, DistanceMetric distance_metric): + self.distance_metric = distance_metric + + cdef ITYPE_t n_samples_X(self) nogil: + """Number of samples in X.""" + return -999 + + cdef ITYPE_t n_samples_Y(self) nogil: + """Number of samples in Y.""" + return -999 + + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.dist(i, j) + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return -1 + +@final +cdef class DenseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Arrays have already been checked + self.X = X + self.Y = Y + self.d = X.shape[1] + + @final + cdef ITYPE_t n_samples_X(self) nogil: + return self.X.shape[0] + + @final + cdef ITYPE_t n_samples_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist(&self.X[i, 0], + &self.Y[j, 0], + self.d) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist(&self.X[i, 0], + &self.Y[j, 0], + self.d) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx new file mode 100644 index 0000000000000..d08b81d48a58c --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -0,0 +1,821 @@ +# Pairwise Distances Reductions +# ============================= +# +# Author: Julien Jerphanion +# +# +# The routines defined here are used in various algorithms performing +# the same structure of operations on distances between vectors +# of a datasets pair (X, Y). +# +# Importantly, the core of the computation is chunked to make sure that the pairwise +# distance chunk matrices stay in CPU cache before applying the final reduction step. +# Furthermore, the chunking strategy is also used to leverage OpenMP-based parallelism +# (using Cython prange loops) which gives another multiplicative speed-up in +# favorable cases on many-core machines. +cimport numpy as np +import numpy as np +import warnings +import scipy.sparse + +from .. import get_config +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from cython cimport final +from cython.parallel cimport parallel, prange + +from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair +from ..utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + ColMajor, + NoTrans, + RowMajor, + Trans, + _dot, + _gemm, +) +from ..utils._heap cimport simultaneous_sort, heap_push +from ..utils._openmp_helpers cimport _openmp_thread_num +from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t + +from numbers import Integral, Real +from typing import List +from scipy.sparse import issparse +from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING +from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils.fixes import threadpool_limits +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._typedefs import ITYPE, DTYPE + +np.import_array() + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. 
+ DTYPE_t * X_ptr = &X[0, 0] + ITYPE_t idx = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return row_norms + +cdef class PairwiseDistancesReduction: + """Abstract base class for pairwise distance computation & reduction + + Subclasses of this class compute pairwise distances between a set of + vectors (rows) X and another set of vectors (rows) Y and apply a + reduction on top. The reduction takes a matrix of pairwise distances + between rows of X and Y as input and outputs an aggregate data-structure + for each row of X. The aggregate values are typically smaller than the number + of rows in Y, hence the term reduction. + + For computational reasons, it is interesting to perform the reduction on + the fly on chunks of rows of X and Y so as to keep intermediate + data-structures in CPU cache and avoid unnecessary round trips of large + distance arrays with the RAM that would otherwise severely degrade the + speed by making the overall processing memory-bound. + + The base class provides the generic chunked parallelization template using + OpenMP loops (Cython prange), either on rows of X or rows of Y depending on + their respective sizes. + + The subclasses are specialized for reduction. + + The actual distance computation for a given pair of rows of X and Y are + delegated to format-specific subclasses of the DatasetsPair companion base + class. + + Parameters + ---------- + datasets_pair: DatasetsPair + The pair of dataset to use. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. + + See _openmp_effective_n_threads, for details about + the specification of n_threads. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + """ + + cdef: + readonly DatasetsPair datasets_pair + + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelisation strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, less threads might be needed to loop over pair of chunks. 
+ # + # Hence the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. + # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + + ITYPE_t n_samples_chunk, chunk_size + + ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder + ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + + bint execute_in_parallel_on_Y + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + "pyfunc", # is relatively slow because we need to coerce data as np arrays + "mahalanobis", # is numerically unstable + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using std::stable_sort and a Comparator taking an + # Arrays of Structures instead of Structure of Arrays (currently used). + "hamming", + *BOOL_METRICS, + } + return sorted(set(METRIC_MAPPING.keys()).difference(excluded)) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the PairwiseDistancesReduction can be used for the given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the PairwiseDistancesReduction can be used, else False. + """ + # Coercing to np.array to get the dtype + # TODO: what is the best way to get lists' dtype? 
+ X = np.asarray(X) if not isinstance(X, (np.ndarray, scipy.sparse.spmatrix)) else X + Y = np.asarray(Y) if not isinstance(Y, (np.ndarray, scipy.sparse.spmatrix)) else Y + # TODO: support sparse arrays and 32 bits + return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and + not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and + metric in cls.valid_metrics()) + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + strategy=None, + ): + cdef: + ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", 256) + + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) + + self.effective_n_threads = _openmp_effective_n_threads(n_threads) + + self.datasets_pair = datasets_pair + + self.n_samples_X = datasets_pair.n_samples_X() + self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) + X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk + self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + + self.n_samples_Y = datasets_pair.n_samples_Y() + self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) + Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk + self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + + # Counting remainder chunk in total number of chunks + self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) + self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + + # Not using less, not using more. + self.chunks_n_threads = min( + self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, + self.effective_n_threads, + ) + + @final + cdef void _parallel_on_X(self) nogil: + """Compute the pairwise distances of each vector (row) of X on Y + by parallelizing computation on chunks of X and reduce them. + + This strategy dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = _openmp_thread_num() + + # Allocating thread datastructures + self._parallel_on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if (X_chunk_idx == self.X_n_chunks - 1 + and self.X_n_samples_remainder > 0): + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + self._parallel_on_X_init_chunk(thread_num, X_start) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if (Y_chunk_idx == self.Y_n_chunks - 1 + and self.Y_n_samples_remainder > 0): + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._parallel_on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) nogil: + """Compute the pairwise distances of each vector (row) of X on Y + by parallelizing computation on chunks of Y and reduce them. + + This strategy dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. + """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t thread_num + + # Allocating datastructures + self._parallel_on_Y_parallel_init() + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = _openmp_thread_num() + + # Initializing datastructures used in this thread + self._parallel_on_Y_init(thread_num) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 \ + and self.Y_n_samples_remainder > 0: + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + + # Note: we don't need a _parallel_on_Y_finalize similarly. + # This can be introduced if needed. 
+ + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._parallel_on_Y_synchronize(X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._parallel_on_Y_finalize() + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is the core critical region of PairwiseDistanceReductions' computations + which must be implemented in subclasses. + """ + return + + def _finalize_results(self, bint return_distance): + """Call-back adapting datastructures before returning results. + + This must be implemented in subclasses. + """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) nogil: + """Convert rank-preserving distances to exact distances or recompute them.""" + return + + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _parallel_on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _parallel_on_Y_parallel_init( + self, + ) nogil: + """Allocate datastructures used in all threads.""" + return + + cdef void _parallel_on_Y_init( + self, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Update thread datastructures before leaving a parallel region.""" + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + """Update datastructures after executing all the reductions.""" + return + +cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): + """Compute the argkmin of vectors (rows) of X on the ones of Y. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + k: int + The k for the argkmin reduction. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~ArgKmin.compute`. + + See _openmp_effective_n_threads, for details about + the specification of n_threads. + """ + + cdef: + ITYPE_t k + + ITYPE_t[:, ::1] argkmin_indices + DTYPE_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. 
+        DTYPE_t ** heaps_r_distances_chunks
+        ITYPE_t ** heaps_indices_chunks
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        ITYPE_t k,
+        str metric="euclidean",
+        chunk_size=None,
+        dict metric_kwargs=None,
+        n_threads=None,
+        str strategy=None,
+        bint return_distance=False,
+    ):
+        """Return the results of the reduction for the given arguments.
+
+        Parameters
+        ----------
+        X : ndarray or CSR matrix of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : ndarray or CSR matrix of shape (n_samples_Y, n_features)
+            Input data.
+
+        k : int
+            The k for the argkmin reduction.
+
+        metric : str, default='euclidean'
+            The distance metric to use for argkmin.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+
+        n_threads : int, default=None
+            The number of OpenMP threads to use for the reduction.
+            Parallelism is done on chunks and the sharding of chunks
+            depends on the `strategy` set on
+            :method:`~PairwiseDistancesArgKmin.compute`.
+
+            See _openmp_effective_n_threads, for details about
+            the specification of n_threads.
+
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset the parallelization is made on.
+
+            Strategies differ on the dispatching they use for chunks on threads:
+
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+              Each thread then iterates on all the chunks of Y. This strategy is
+              embarrassingly parallel and comes with no datastructures synchronisation.
+
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+              Each thread then iterates on all the chunks of X. This strategy is
+              embarrassingly parallel but uses intermediate datastructures
+              synchronisation.
+
+              - 'auto' relies on a simple heuristic to choose between
+              'parallel_on_X' and 'parallel_on_Y'.
+
+              - None (default) looks-up in scikit-learn configuration for
+                `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+
+        return_distance : boolean, default=False
+            Return distances between each X vector and its
+            argkmin if set to True.
+
+        Returns
+        -------
+        Indices of argkmin for each vector in X and its associated distances
+        if return_distance=True.
+
+        Notes
+        -----
+        This public classmethod is responsible for introspecting the argument
+        values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute`
+        instance method of the most appropriate :class:`PairwiseDistancesArgKmin`
+        concrete implementation.
+
+        All temporarily allocated datastructures necessary for the concrete
+        implementation are therefore freed when this classmethod returns.
+
+        This allows decoupling the interface entirely from the
+        implementation details whilst maintaining RAII.
+        """
+        # Note (jjerphan): Some design thoughts for future extensions.
+        # This factory handles the specialisations for the given arguments.
+        # For future work, this might be an entrypoint to specialise operations
+        # for various back-end and/or hardware and/or datatypes, and/or fused
+        # {sparse, dense}-datasetspair etc.
+ + pda = PairwiseDistancesArgKmin( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + def __init__( + self, + DatasetsPair datasets_pair, + ITYPE_t k, + chunk_size=None, + n_threads=None, + strategy=None, + ): + super().__init__(datasets_pair, chunk_size, n_threads, strategy) + + self.k = check_scalar(k, "k", Integral, min_val=1) + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, those heaps pointers are referencing + # (with proper offsets) addresses of the two main heaps (see bellow) + # - when parallelizing on Y, those heaps pointer heaps are referencing + # small heaps which are thread-wise-allocated and whose content will be + # merged with the main heaps'. + self.heaps_r_distances_chunks = malloc( + sizeof(DTYPE_t *) * self.chunks_n_threads + ) + self.heaps_indices_chunks = malloc( + sizeof(ITYPE_t *) * self.chunks_n_threads + ) + + # Main heaps used by PairwiseDistancesArgKmin._compute to return results. + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_samples_X = X_end - X_start + ITYPE_t n_samples_Y = Y_end - Y_start + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(n_samples_X): + for j in range(n_samples_Y): + heap_push( + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + self.k, + self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), + Y_start + j, + ) + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ) nogil: + # As this strategy is embarrassingly parallel, we can set each + # thread's heaps pointer to the proper position on the main heaps. 
self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0]
+        self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0]
+
+    @final
+    cdef void _parallel_on_X_prange_iter_finalize(
+        self,
+        ITYPE_t thread_num,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx
+
+        # Sorting indices of the argkmin for each query vector of X
+        for idx in range(X_end - X_start):
+            simultaneous_sort(
+                self.heaps_r_distances_chunks[thread_num] + idx * self.k,
+                self.heaps_indices_chunks[thread_num] + idx * self.k,
+                self.k
+            )
+
+    cdef void _parallel_on_Y_parallel_init(
+        self,
+    ) nogil:
+        cdef:
+            # Maximum number of scalar elements (the last chunks can be smaller)
+            ITYPE_t heaps_size = self.X_n_samples_chunk * self.k
+            ITYPE_t thread_num
+
+        # The allocation is done in parallel for data locality purposes: this way
+        # the heaps used in each thread are allocated in pages which are closer
+        # to the processor core used by that thread.
+        for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True,
+                                 num_threads=self.chunks_n_threads):
+            # Each thread processes all the chunks of X, so the main heaps would
+            # be shared across threads. To avoid this, each thread uses its own
+            # heaps which are then synchronised back into the main ones.
+            self.heaps_r_distances_chunks[thread_num] = <DTYPE_t *> malloc(
+                heaps_size * sizeof(DTYPE_t)
+            )
+            self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(
+                heaps_size * sizeof(ITYPE_t)
+            )
+
+    @final
+    cdef void _parallel_on_Y_init(
+        self,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Initialising heaps (memset can't be used here)
+        for idx in range(self.X_n_samples_chunk * self.k):
+            self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX
+            self.heaps_indices_chunks[thread_num][idx] = -1
+
+    @final
+    cdef void _parallel_on_Y_synchronize(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx, thread_num
+        with nogil, parallel(num_threads=self.effective_n_threads):
+            # Synchronising the thread heaps with the main heaps.
+            # This is done in parallel sample-wise (no need for locks).
+            # This might break each thread's data locality a bit,
+            # but this is negligible and this parallel pattern has
+            # shown to be efficient in practice.
+ for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(self.chunks_n_threads): + for jdx in range(self.k): + heap_push( + &self.argkmin_distances[X_start + idx, 0], + &self.argkmin_indices[X_start + idx, 0], + self.k, + self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t idx, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sort the main heaps into arrays in parallel + # in ascending order w.r.t the distances + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) nogil: + cdef: + ITYPE_t i, j + ITYPE_t[:, ::1] Y_indices = self.argkmin_indices + DTYPE_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_samples_X, schedule='static', nogil=True, + num_threads=self.effective_n_threads): + for j in range(self.k): + distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + max(distances[i, j], 0.) + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 69925a3590be6..2bf582506922e 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -26,6 +26,12 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_pairwise_distances_reduction", + sources=["_pairwise_distances_reduction.pyx"], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py new file mode 100644 index 0000000000000..c6efeb8259a20 --- /dev/null +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -0,0 +1,379 @@ +import numpy as np +import pytest +from collections import defaultdict +from numpy.testing import assert_array_equal, assert_allclose +from scipy.sparse import csr_matrix + +from sklearn.metrics._pairwise_distances_reduction import ( + PairwiseDistancesReduction, + PairwiseDistancesArgKmin, + _sqeuclidean_row_norms, +) + +from sklearn.utils import _in_unstable_openblas_configuration +from sklearn.utils.fixes import sp_version, parse_version +from sklearn.utils._testing import fails_if_unstable_openblas + + +def _get_dummy_metric_params_list(metric: str, n_features: int): + """Return list of dummy DistanceMetric kwargs for tests.""" + + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + METRICS_PARAMS = defaultdict( + list, + { + "euclidean": [{}], + "manhattan": [{}], + "minkowski": 
[dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)], + "chebyshev": [{}], + "seuclidean": [dict(V=rng.rand(n_features))], + "haversine": [{}], + "wminkowski": [dict(p=1.5, w=weights)], + "mahalanobis": [dict(VI=VI)], + }, + ) + + wminkowski_kwargs = dict(p=3, w=rng.rand(n_features)) + + if sp_version < parse_version("1.8.0.dev0"): + # TODO: remove once we no longer support scipy < 1.8.0. + # wminkowski was removed in scipy 1.8.0 but should work for previous + # versions. + METRICS_PARAMS["wminkowski"].append(wminkowski_kwargs) # type: ignore + else: + # Recent scipy versions accept weights in the Minkowski metric directly: + # type: ignore + METRICS_PARAMS["minkowski"].append(wminkowski_kwargs) # type: ignore + + return METRICS_PARAMS.get(metric, [{}]) + + +def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): + assert_array_equal( + ref_indices, + indices, + err_msg="Query vectors have different neighbors' indices", + ) + assert_allclose( + ref_dist, + dist, + err_msg="Query vectors have different neighbors' distances", + rtol=1e-7, + ) + + +ASSERT_RESULT = { + PairwiseDistancesArgKmin: assert_argkmin_results_equality, +} + + +def test_pairwise_distances_reduction_is_usable_for(): + rng = np.random.RandomState(0) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + assert not PairwiseDistancesReduction.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) + + assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") + # TODO: remove once 32 bits datasets are supported + assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric) + + # TODO: remove once sparse matrices are supported + assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesArgKmin.compute( + X=X.astype(np.float32), Y=Y, k=k, metric=metric + ) + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." 
+ ): + PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=0, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesArgKmin.compute( + X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesArgKmin.compute( + X=np.asfortranarray(X), Y=Y, k=k, metric=metric + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_chunk_size_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + n_features=100, + dtype=np.float64, +): + # Results should not depend on the chunk size + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + chunk_size=chunk_size, + return_distance=True, + ) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_n_threads_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + n_features=100, + dtype=np.float64, +): + # Results should not depend on the number of threads + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X, Y, parameter, n_threads=1, return_distance=True + ) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_strategies_consistency( + PairwiseDistancesReduction, + metric, + n_samples, + seed, + n_features=10, + dtype=np.float64, 
+): + # Results obtained using both parallelization strategies must be identical + if _in_unstable_openblas_configuration() and metric in ("sqeuclidean", "euclidean"): + pytest.xfail( + "OpenBLAS (used for '(sq)euclidean') is unstable in this configuration" + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + dist_par_X, indices_par_X = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + # To be sure to use parallelization + chunk_size=n_samples // 4, + strategy="parallel_on_X", + return_distance=True, + ) + + dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + # To be sure to use parallelization + chunk_size=n_samples // 4, + strategy="parallel_on_Y", + return_distance=True, + ) + + ASSERT_RESULT[PairwiseDistancesReduction]( + dist_par_X, + dist_par_Y, + indices_par_X, + indices_par_Y, + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("n_features", [50, 500]) +@pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_euclidean_translation_invariance( + n_features, + translation, + metric, + PairwiseDistancesReduction, + n_samples=1000, + dtype=np.float64, +): + # The reduction must be translation invariant. 
+ parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the numbers of dimensions + else 10 ** np.log(n_features) + ) + + rng = np.random.RandomState(0) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + reference_dist, reference_indices = PairwiseDistancesReduction.compute( + X, + Y, + parameter, + metric=metric, + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X + 0, + Y + 0, + parameter, + metric=metric, + metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + return_distance=True, + ) + + ASSERT_RESULT[PairwiseDistancesReduction]( + reference_dist, dist, reference_indices, indices + ) + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("num_threads", [1, 2, 8]) +def test_sqeuclidean_row_norms( + seed, + n_samples, + n_features, + num_threads, + dtype=np.float64, +): + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + + assert_allclose(sq_row_norm_reference, sq_row_norm) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3d8a1ca87d210..4b2261ad7c2f4 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,7 +26,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .fixes import np_version, parse_version +from .fixes import np_version, parse_version, threadpool_info from ._estimator_html_repr import estimator_html_repr from .validation import ( as_float_array, @@ -81,6 +81,39 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 +def _in_unstable_openblas_configuration(): + """Return True if in an unstable configuration for OpenBLAS""" + + # Import libraries which might load OpenBLAS. + import numpy # noqa + import scipy # noqa + + modules_info = threadpool_info() + + open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) + if not open_blas_used: + return False + + # OpenBLAS 0.3.16 fixed unstability for arm64, see: + # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa + openblas_arm64_stable_version = parse_version("0.3.16") + for info in modules_info: + if info["internal_api"] != "openblas": + continue + openblas_version = info.get("version") + openblas_architecture = info.get("architecture") + if openblas_version is None or openblas_architecture is None: + # Cannot be sure that OpenBLAS is good enough. Assume unstable: + return True + if ( + openblas_architecture == "neoversen1" + and parse_version(openblas_version) < openblas_arm64_stable_version + ): + # See discussions in https://github.com/numpy/numpy/issues/19411 + return True + return False + + class Bunch(dict): """Container object exposing keys as attributes. 
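[Editor's note] As an aside on the helper introduced just above, here is a minimal sketch of the OpenBLAS version check it performs. This is an illustration only, not part of the patch; the `threadpool_info()`-style entry below is a made-up example that carries the `internal_api`, `version` and `architecture` keys the helper reads, and the sketch only mirrors the arm64 branch of the check (a missing version or architecture is treated as unstable by the real helper).

    from sklearn.utils.fixes import parse_version

    # Hypothetical threadpoolctl entry for an OpenBLAS build on an arm64 (neoversen1) core.
    info = {"internal_api": "openblas", "version": "0.3.13", "architecture": "neoversen1"}

    # OpenBLAS older than 0.3.16 on neoversen1 is flagged as unstable.
    unstable = (
        info["internal_api"] == "openblas"
        and info.get("version") is not None
        and info.get("architecture") == "neoversen1"
        and parse_version(info["version"]) < parse_version("0.3.16")
    )
    print(unstable)  # True

The test suite consumes this predicate through the `fails_if_unstable_openblas` xfail marker added below in `sklearn/utils/_testing.py`.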
diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd new file mode 100644 index 0000000000000..e57fc9bfa6bf5 --- /dev/null +++ b/sklearn/utils/_openmp_helpers.pxd @@ -0,0 +1,6 @@ +# Helpers to access OpenMP threads information +# +# Those interfaces act as indirections which allows the non-support of OpenMP +# for implementations which have been written for it. + +cdef int _openmp_thread_num() nogil diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx index fb8920074a84e..cddd77ac42746 100644 --- a/sklearn/utils/_openmp_helpers.pyx +++ b/sklearn/utils/_openmp_helpers.pyx @@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: def _openmp_parallelism_enabled(): """Determines whether scikit-learn has been built with OpenMP - + It allows to retrieve at runtime the information gathered at compile time. """ # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during @@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None): - if the ``OMP_NUM_THREADS`` environment variable is set, return ``openmp.omp_get_max_threads()`` - otherwise, return the minimum between ``openmp.omp_get_max_threads()`` - and the number of cpus, taking cgroups quotas into account. Cgroups + and the number of cpus, taking cgroups quotas into account. Cgroups quotas can typically be set by tools such as Docker. The result of ``omp_get_max_threads`` can be influenced by environment variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``. @@ -59,4 +59,13 @@ cpdef _openmp_effective_n_threads(n_threads=None): # OpenMP disabled at build-time => sequential mode return 1 - + +cdef inline int _openmp_thread_num() nogil: + """Return the number of the thread calling this function. + + If scikit-learn is built without OpenMP support, always return 0. + """ + IF SKLEARN_OPENMP_PARALLELISM_ENABLED: + return openmp.omp_get_thread_num() + ELSE: + return 0 diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1724063be2f43..6f58ce3f3b7b4 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,7 +48,12 @@ import joblib import sklearn -from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated +from sklearn.utils import ( + IS_PYPY, + _IS_32BIT, + deprecated, + _in_unstable_openblas_configuration, +) from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -448,6 +453,10 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") + fails_if_unstable_openblas = pytest.mark.xfail( + _in_unstable_openblas_configuration(), + reason="OpenBLAS is unstable for this configuration", + ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) From 14106c484e856c0e7466455d295c9229d952a5c8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 10:48:38 +0100 Subject: [PATCH 02/22] Retrigger CI for failing Circle CI job From 3cdd3a5ad099326b1bd2c8601d7360272c10f7e4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 11:15:45 +0100 Subject: [PATCH 03/22] TST Improve _get_dummy_metric_params_list Co-authored-by: Thomas J. 
Fan --- .../test_pairwise_distances_reduction.py | 71 ++++++++++--------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c6efeb8259a20..89d012ac148ee 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,6 +1,5 @@ import numpy as np import pytest -from collections import defaultdict from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import csr_matrix @@ -18,42 +17,44 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): """Return list of dummy DistanceMetric kwargs for tests.""" + # Distinguishing on cases not to compute unneeded datastructures. rng = np.random.RandomState(1) - weights = rng.random_sample(n_features) - weights /= weights.sum() - - V = rng.random_sample((n_features, n_features)) - - # VI is positive-semidefinite, preferred for precision matrix - VI = np.dot(V, V.T) + 3 * np.eye(n_features) - - METRICS_PARAMS = defaultdict( - list, - { - "euclidean": [{}], - "manhattan": [{}], - "minkowski": [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)], - "chebyshev": [{}], - "seuclidean": [dict(V=rng.rand(n_features))], - "haversine": [{}], - "wminkowski": [dict(p=1.5, w=weights)], - "mahalanobis": [dict(VI=VI)], - }, - ) - - wminkowski_kwargs = dict(p=3, w=rng.rand(n_features)) - - if sp_version < parse_version("1.8.0.dev0"): - # TODO: remove once we no longer support scipy < 1.8.0. - # wminkowski was removed in scipy 1.8.0 but should work for previous - # versions. - METRICS_PARAMS["wminkowski"].append(wminkowski_kwargs) # type: ignore - else: - # Recent scipy versions accept weights in the Minkowski metric directly: - # type: ignore - METRICS_PARAMS["minkowski"].append(wminkowski_kwargs) # type: ignore - return METRICS_PARAMS.get(metric, [{}]) + if metric == "minkowski": + minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)] + if sp_version >= parse_version("1.8.0.dev0"): + # TODO: remove the test once we no longer support scipy < 1.8.0. + # Recent scipy versions accept weights in the Minkowski metric directly: + # type: ignore + minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features))) + + return minkowski_kwargs + + # TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0. + if metric == "wminkowski": + weights = rng.random_sample(n_features) + weights /= weights.sum() + wminkowski_kwargs = [dict(p=1.5, w=weights)] + if sp_version < parse_version("1.8.0.dev0"): + # wminkowski was removed in scipy 1.8.0 but should work for previous + # versions. + wminkowski_kwargs.append(dict(p=3, w=rng.rand(n_features))) + return wminkowski_kwargs + + if metric == "seuclidean": + return [dict(V=rng.rand(n_features))] + + if metric == "mahalanobis": + V = rng.random_sample((n_features, n_features)) + # This makes VI is positive-semidefinite, which is a + # necessary condition to get nonsingular precision matrix. + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + return [dict(VI=VI)] + + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. + # In those cases, no kwargs is needed. 
+ return [{}] def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): From 31db785b792a43b0064fc51fbb43c7709c4310bb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 13:10:25 +0100 Subject: [PATCH 04/22] Address review comments Co-authored-by: Christian Lorentzen --- sklearn/metrics/_dist_metrics.pyx | 26 ++++++++++++++------------ sklearn/metrics/setup.py | 1 + 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 3def08da7965c..6d090bdebafe5 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1184,7 +1184,8 @@ cdef class DatasetsPair: aggregation logic from metric-specific computation as much as possible. - X and Y can be stored as np.ndarrays or CSR matrices in subclasses. + X and Y can be stored as C-contiguous np.ndarrays or CSR matrices + in subclasses. This class avoids the overhead of dispatching distance computations to :class:`sklearn.metrics.DistanceMetric` based on the physical @@ -1240,7 +1241,7 @@ cdef class DatasetsPair: **(metric_kwargs or {}) ) - if X.dtype != np.float64 or Y.dtype != np.float64: + if not(X.dtype == Y.dtype == np.float64): raise ValueError("Only 64bit float datasets are supported for X and Y.") # Metric-specific checks that do not replace nor duplicate `check_array`. @@ -1252,34 +1253,35 @@ cdef class DatasetsPair: return DenseDenseDatasetsPair(X, Y, distance_metric) - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure getting ITYPE instead of int internally used for CSR matrices.""" - X_data = np.asarray(X.data, dtype=DTYPE) - X_indices = np.asarray(X.indices, dtype=ITYPE) - X_indptr = np.asarray(X.indptr, dtype=ITYPE) - return X_data, X_indptr, X_indptr - def __init__(self, DistanceMetric distance_metric): self.distance_metric = distance_metric cdef ITYPE_t n_samples_X(self) nogil: """Number of samples in X.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 return -999 cdef ITYPE_t n_samples_Y(self) nogil: """Number of samples in Y.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 return -999 cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil: return self.dist(i, j) cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 return -1 @final cdef class DenseDenseDatasetsPair(DatasetsPair): - """Compute distances between vectors of two arrays. + """Compute distances between row vectors of two arrays. Parameters ---------- @@ -1291,7 +1293,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): distance_metric: DistanceMetric The distance metric responsible for computing distances - between two vectors of (X, Y). + between two row vectors of (X, Y). 
""" def __init__(self, X, Y, DistanceMetric distance_metric): diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index 2bf582506922e..1c26d9969397c 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -29,6 +29,7 @@ def configuration(parent_package="", top_path=None): config.add_extension( "_pairwise_distances_reduction", sources=["_pairwise_distances_reduction.pyx"], + include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], libraries=libraries, ) From b60e8977dab871fad0c2054c43e9e52039c6a5c0 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 17:33:25 +0100 Subject: [PATCH 05/22] Address review comments Co-authored-by: Christian Lorentzen --- .../metrics/_pairwise_distances_reduction.pyx | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index d08b81d48a58c..40f3973ad225a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -4,8 +4,8 @@ # Author: Julien Jerphanion # # -# The routines defined here are used in various algorithms performing -# the same structure of operations on distances between vectors +# The abstractions defined here are used in various algorithms performing +# the same structure of operations on distances between row vectors # of a datasets pair (X, Y). # # Importantly, the core of the computation is chunked to make sure that the pairwise @@ -37,7 +37,7 @@ from ..utils._cython_blas cimport ( ) from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num -from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t +from ..utils._typedefs cimport ITYPE_t, DTYPE_t from numbers import Integral, Real from typing import List @@ -48,8 +48,10 @@ from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE + np.import_array() + cpdef DTYPE_t[::1] _sqeuclidean_row_norms( const DTYPE_t[:, ::1] X, ITYPE_t num_threads, @@ -62,26 +64,27 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms( # Casting for X to remove the const qualifier is needed because APIs # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' # const qualifier. + # See: https://github.com/scipy/scipy/issues/1426 DTYPE_t * X_ptr = &X[0, 0] ITYPE_t idx = 0 ITYPE_t n = X.shape[0] ITYPE_t d = X.shape[1] - DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): - row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) - return row_norms + return squared_row_norms cdef class PairwiseDistancesReduction: - """Abstract base class for pairwise distance computation & reduction + """Abstract base class for pairwise distance computation & reduction. Subclasses of this class compute pairwise distances between a set of - vectors (rows) X and another set of vectors (rows) Y and apply a - reduction on top. The reduction takes a matrix of pairwise distances - between rows of X and Y as input and outputs an aggregate data-structure - for each row of X. The aggregate values are typically smaller than the number - of rows in Y, hence the term reduction. 
+ row vectors of X and another set of row vectors pf Y and apply a reduction on top. + The reduction takes a matrix of pairwise distances between rows of X and Y + as input and outputs an aggregate data-structure for each row of X. + The aggregate values are typically smaller than the number of rows in Y, + hence the term reduction. For computational reasons, it is interesting to perform the reduction on the fly on chunks of rows of X and Y so as to keep intermediate @@ -104,7 +107,7 @@ cdef class PairwiseDistancesReduction: datasets_pair: DatasetsPair The pair of dataset to use. - chunk_size: int, default=None, + chunk_size: int, default=None The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, and use 256 if it is not set. @@ -176,7 +179,7 @@ cdef class PairwiseDistancesReduction: "hamming", *BOOL_METRICS, } - return sorted(set(METRIC_MAPPING.keys()).difference(excluded)) + return sorted(set(METRIC_MAPPING.keys()) - excluded) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: @@ -266,7 +269,7 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_X(self) nogil: - """Compute the pairwise distances of each vector (row) of X on Y + """Compute the pairwise distances of each row vector of X on Y by parallelizing computation on chunks of X and reduce them. This strategy dispatches chunks of X uniformly on threads. @@ -326,7 +329,7 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_Y(self) nogil: - """Compute the pairwise distances of each vector (row) of X on Y + """Compute the pairwise distances of each row vector of X on Y by parallelizing computation on chunks of Y and reduce them. This strategy dispatches chunks of Y uniformly on threads. @@ -399,8 +402,8 @@ cdef class PairwiseDistancesReduction: ) nogil: """Compute the pairwise distances on two chunks of X and Y and reduce them. - This is the core critical region of PairwiseDistanceReductions' computations - which must be implemented in subclasses. + This is THE core computational method of PairwiseDistanceReductions. + This must be implemented in subclasses. """ return @@ -476,7 +479,7 @@ cdef class PairwiseDistancesReduction: return cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - """Compute the argkmin of vectors (rows) of X on the ones of Y. + """Compute the argkmin of row vectors of X on the ones of Y. Parameters ---------- From 5d7ea09a96008732d6250915344374c992e4a7c7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 17:07:10 +0100 Subject: [PATCH 06/22] DEBUG TST Try removing handling of unstable OpenBLAS configuration This was introduced once a long time ago for a failure which was happening in a single configuration (see the comments). Let's see if this has been fixed. 
Co-authored-by: Christian Lorentzen --- .../metrics/_pairwise_distances_reduction.pyx | 2 +- .../test_pairwise_distances_reduction.py | 10 ------ sklearn/utils/__init__.py | 35 +------------------ sklearn/utils/_testing.py | 11 +----- 4 files changed, 3 insertions(+), 55 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 40f3973ad225a..b2bee50174d1d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -43,7 +43,7 @@ from numbers import Integral, Real from typing import List from scipy.sparse import issparse from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING -from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils import check_scalar from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 89d012ac148ee..710ae9636494b 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -9,9 +9,7 @@ _sqeuclidean_row_norms, ) -from sklearn.utils import _in_unstable_openblas_configuration from sklearn.utils.fixes import sp_version, parse_version -from sklearn.utils._testing import fails_if_unstable_openblas def _get_dummy_metric_params_list(metric: str, n_features: int): @@ -140,7 +138,6 @@ def test_argkmin_factory_method_wrong_usages(): ) -@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @@ -187,7 +184,6 @@ def test_chunk_size_agnosticism( ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) -@fails_if_unstable_openblas @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) @@ -245,11 +241,6 @@ def test_strategies_consistency( n_features=10, dtype=np.float64, ): - # Results obtained using both parallelization strategies must be identical - if _in_unstable_openblas_configuration() and metric in ("sqeuclidean", "euclidean"): - pytest.xfail( - "OpenBLAS (used for '(sq)euclidean') is unstable in this configuration" - ) rng = np.random.RandomState(seed) spread = 100 @@ -302,7 +293,6 @@ def test_strategies_consistency( ) -@fails_if_unstable_openblas @pytest.mark.parametrize("n_features", [50, 500]) @pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4b2261ad7c2f4..3d8a1ca87d210 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,7 +26,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .fixes import np_version, parse_version, threadpool_info +from .fixes import np_version, parse_version from ._estimator_html_repr import estimator_html_repr from .validation import ( as_float_array, @@ -81,39 +81,6 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 -def _in_unstable_openblas_configuration(): - """Return True if in an unstable configuration for OpenBLAS""" - - # Import libraries which might load OpenBLAS. 
- import numpy # noqa - import scipy # noqa - - modules_info = threadpool_info() - - open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) - if not open_blas_used: - return False - - # OpenBLAS 0.3.16 fixed unstability for arm64, see: - # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa - openblas_arm64_stable_version = parse_version("0.3.16") - for info in modules_info: - if info["internal_api"] != "openblas": - continue - openblas_version = info.get("version") - openblas_architecture = info.get("architecture") - if openblas_version is None or openblas_architecture is None: - # Cannot be sure that OpenBLAS is good enough. Assume unstable: - return True - if ( - openblas_architecture == "neoversen1" - and parse_version(openblas_version) < openblas_arm64_stable_version - ): - # See discussions in https://github.com/numpy/numpy/issues/19411 - return True - return False - - class Bunch(dict): """Container object exposing keys as attributes. diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 6f58ce3f3b7b4..1724063be2f43 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,12 +48,7 @@ import joblib import sklearn -from sklearn.utils import ( - IS_PYPY, - _IS_32BIT, - deprecated, - _in_unstable_openblas_configuration, -) +from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -453,10 +448,6 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") - fails_if_unstable_openblas = pytest.mark.xfail( - _in_unstable_openblas_configuration(), - reason="OpenBLAS is unstable for this configuration", - ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) From fb927e74bc8ac328cbfde3cbf75f2985ca837569 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 23 Dec 2021 18:28:42 +0100 Subject: [PATCH 07/22] TST Remove useless mahalanobis case --- .../metrics/tests/test_pairwise_distances_reduction.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 710ae9636494b..844f8e7138d20 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -42,14 +42,6 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): if metric == "seuclidean": return [dict(V=rng.rand(n_features))] - if metric == "mahalanobis": - V = rng.random_sample((n_features, n_features)) - # This makes VI is positive-semidefinite, which is a - # necessary condition to get nonsingular precision matrix. - VI = np.dot(V, V.T) + 3 * np.eye(n_features) - - return [dict(VI=VI)] - # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. # In those cases, no kwargs is needed. return [{}] From a2f7b6d11597388e95443325d692037ef6eb321c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 3 Jan 2022 10:00:36 +0100 Subject: [PATCH 08/22] Factor the logic for computing last chunks indices Co-authored-by: Christian Lorentzen Co-authored-by: Thomas J. 
Fan --- .../metrics/_pairwise_distances_reduction.pyx | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index b2bee50174d1d..f2d6715e81911 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -162,8 +162,8 @@ cdef class PairwiseDistancesReduction: ITYPE_t n_samples_chunk, chunk_size - ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder - ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk bint execute_in_parallel_on_Y @@ -233,16 +233,24 @@ cdef class PairwiseDistancesReduction: self.n_samples_X = datasets_pair.n_samples_X() self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk - self.X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0) + + if X_n_samples_remainder != 0: + self.X_n_samples_last_chunk = X_n_samples_remainder + else: + self.X_n_samples_last_chunk = self.X_n_samples_chunk self.n_samples_Y = datasets_pair.n_samples_Y() self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk - self.Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0) - # Counting remainder chunk in total number of chunks - self.X_n_chunks = X_n_full_chunks + (self.X_n_samples_remainder != 0) - self.Y_n_chunks = Y_n_full_chunks + (self.Y_n_samples_remainder != 0) + if Y_n_samples_remainder != 0: + self.Y_n_samples_last_chunk = Y_n_samples_remainder + else: + self.Y_n_samples_last_chunk = self.Y_n_samples_chunk if strategy is None: strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') @@ -293,9 +301,8 @@ cdef class PairwiseDistancesReduction: for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): X_start = X_chunk_idx * self.X_n_samples_chunk - if (X_chunk_idx == self.X_n_chunks - 1 - and self.X_n_samples_remainder > 0): - X_end = X_start + self.X_n_samples_remainder + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk else: X_end = X_start + self.X_n_samples_chunk @@ -304,9 +311,8 @@ cdef class PairwiseDistancesReduction: for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if (Y_chunk_idx == self.Y_n_chunks - 1 - and self.Y_n_samples_remainder > 0): - Y_end = Y_start + self.Y_n_samples_remainder + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk else: Y_end = Y_start + self.Y_n_samples_chunk @@ -351,8 +357,8 @@ cdef class PairwiseDistancesReduction: for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk - if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: - X_end = X_start + self.X_n_samples_remainder + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk else: X_end = X_start + self.X_n_samples_chunk @@ -364,9 +370,8 @@ cdef class PairwiseDistancesReduction: 
for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk - if Y_chunk_idx == self.Y_n_chunks - 1 \ - and self.Y_n_samples_remainder > 0: - Y_end = Y_start + self.Y_n_samples_remainder + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk else: Y_end = Y_start + self.Y_n_samples_chunk From e9acef73941b9d42ac999b17e73a6f9d1135c762 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 3 Jan 2022 15:55:27 +0100 Subject: [PATCH 09/22] Improve comments regarding strategies and parallel sections Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index f2d6715e81911..86dee15dd9cc7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -123,19 +123,27 @@ cdef class PairwiseDistancesReduction: strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None The chunking strategy defining which dataset parallelization are made on. - Strategies differs on the dispatching they use for chunks on threads: + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: - 'parallel_on_X' dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is embarrassingly parallel and comes with no datastructures synchronisation. - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. - 'auto' relies on a simple heuristic to choose between - 'parallel_on_X' and 'parallel_on_Y'. + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. @@ -278,7 +286,8 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_X(self) nogil: """Compute the pairwise distances of each row vector of X on Y - by parallelizing computation on chunks of X and reduce them. + by parallelizing computation on the outer loop on chunks of X + and reduce them. This strategy dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is @@ -336,7 +345,8 @@ cdef class PairwiseDistancesReduction: @final cdef void _parallel_on_Y(self) nogil: """Compute the pairwise distances of each row vector of X on Y - by parallelizing computation on chunks of Y and reduce them. + by parallelizing computation on the inner loop on chunks of Y + and reduce them. 
This strategy dispatches chunks of Y uniformly on threads. Each thread then iterates on all the chunks of X. This strategy is @@ -352,7 +362,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx ITYPE_t thread_num - # Allocating datastructures + # Allocating datastructures shared by all threads self._parallel_on_Y_parallel_init() for X_chunk_idx in range(self.X_n_chunks): @@ -659,7 +669,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.chunks_n_threads ) - # Main heaps used by PairwiseDistancesArgKmin._compute to return results. + # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`. self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -685,8 +695,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] - # Pushing the distance and their associated indices on heaps - # which keep tracks of the argkmin. + # Pushing the distances and their associated indices on a heap + # which by construction will keep track of the argkmin. for i in range(n_samples_X): for j in range(n_samples_Y): heap_push( @@ -718,7 +728,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): cdef: ITYPE_t idx, jdx - # Sorting indices of the argkmin for each query vector of X + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. for idx in range(X_end - X_start): simultaneous_sort( self.heaps_r_distances_chunks[thread_num] + idx * self.k, @@ -736,7 +747,9 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # The allocation is done in parallel for data locality purposes: this way # the heaps used in each threads are allocated in pages which are closer - # to processor core used by the thread. + # to the CPU core used by the thread. + # See comments about First Touch Placement Policy: + # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, num_threads=self.chunks_n_threads): # As chunks of X are shared across threads, so must their @@ -770,9 +783,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. # This is done in parallel sample-wise (no need for locks). - # This might break each thread's data locality a bit but - # but this is negligible and this parallel pattern has - # shown to be efficient in practice. + # + # This might break each thread's data locality as each heap which + # was allocated in a thread is being now being used in several threads. + # + # Still, this parallel pattern has shown to be efficient in practice. for idx in prange(X_end - X_start, schedule="static"): for thread_num in range(self.chunks_n_threads): for jdx in range(self.k): @@ -796,8 +811,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): free(self.heaps_r_distances_chunks[thread_num]) free(self.heaps_indices_chunks[thread_num]) - # Sort the main heaps into arrays in parallel - # in ascending order w.r.t the distances + # Sorting the main in ascending order w.r.t the distances. 
+ # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): simultaneous_sort( &self.argkmin_distances[idx, 0], From 51dad2b2065d73fded8af7cb8d88265527564249 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 10:53:26 +0100 Subject: [PATCH 10/22] Address reviews' comments Co-authored-by: Thomas J. Fan Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx | 14 +++++---- .../metrics/_pairwise_distances_reduction.pyx | 30 +++++++++---------- .../test_pairwise_distances_reduction.py | 19 ++++++------ 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 6d090bdebafe5..6cf93baeca925 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1222,10 +1222,10 @@ cdef class DatasetsPair: If provided as a sparse matrix, it must be in CSR format. metric : str, default='euclidean' - The distance metric to use for argkmin. The default metric is - a fast implementation of the standard Euclidean metric. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. + The distance metric to compute between rows of X and Y. + The default metric is a fast implementation of the Euclidean + metric. For a list of available metrics, see the documentation + of :class:`~sklearn.metrics.DistanceMetric`. metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. @@ -1242,12 +1242,16 @@ cdef class DatasetsPair: ) if not(X.dtype == Y.dtype == np.float64): - raise ValueError("Only 64bit float datasets are supported for X and Y.") + raise ValueError( + f"Only 64bit float datasets are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}" + ) # Metric-specific checks that do not replace nor duplicate `check_array`. distance_metric._validate_data(X) distance_metric._validate_data(Y) + # TODO: dispatch to other dataset pairs for sparse support once available: if issparse(X) or issparse(Y): raise ValueError("Only dense datasets are supported for X and Y.") diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 86dee15dd9cc7..3ae51e0ee00ab 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -181,7 +181,7 @@ cdef class PairwiseDistancesReduction: "pyfunc", # is relatively slow because we need to coerce data as np arrays "mahalanobis", # is numerically unstable # TODO: In order to support discrete distance metrics, we need to have a - # simultaneous sort which breaks ties on indices when distances are identical. + # stable simultaneous sort which preserves the order of the input. # The best might be using std::stable_sort and a Comparator taking an # Arrays of Structures instead of Structure of Arrays (currently used). "hamming", @@ -210,13 +210,9 @@ cdef class PairwiseDistancesReduction: ------- True if the PairwiseDistancesReduction can be used, else False. """ - # Coercing to np.array to get the dtype - # TODO: what is the best way to get lists' dtype? 
- X = np.asarray(X) if not isinstance(X, (np.ndarray, scipy.sparse.spmatrix)) else X - Y = np.asarray(Y) if not isinstance(Y, (np.ndarray, scipy.sparse.spmatrix)) else Y # TODO: support sparse arrays and 32 bits - return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and - not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and + return (not issparse(X) and X.dtype == np.float64 and + not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) def __init__( @@ -289,9 +285,11 @@ cdef class PairwiseDistancesReduction: by parallelizing computation on the outer loop on chunks of X and reduce them. - This strategy dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is - embarrassingly parallel and comes with no datastructures synchronisation. + This strategy dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. Private datastructures are modified internally by threads. @@ -363,7 +361,7 @@ cdef class PairwiseDistancesReduction: ITYPE_t thread_num # Allocating datastructures shared by all threads - self._parallel_on_Y_parallel_init() + self._parallel_on_Y_init() for X_chunk_idx in range(self.X_n_chunks): X_start = X_chunk_idx * self.X_n_samples_chunk @@ -376,7 +374,7 @@ cdef class PairwiseDistancesReduction: thread_num = _openmp_thread_num() # Initializing datastructures used in this thread - self._parallel_on_Y_init(thread_num) + self._parallel_on_Y_parallel_init(thread_num) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -466,13 +464,13 @@ cdef class PairwiseDistancesReduction: """Interact with datastructures after executing all the reductions.""" return - cdef void _parallel_on_Y_parallel_init( + cdef void _parallel_on_Y_init( self, ) nogil: """Allocate datastructures used in all threads.""" return - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, ) nogil: @@ -737,7 +735,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): self.k ) - cdef void _parallel_on_Y_parallel_init( + cdef void _parallel_on_Y_init( self, ) nogil: cdef: @@ -763,7 +761,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): ) @final - cdef void _parallel_on_Y_init( + cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, ) nogil: diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 844f8e7138d20..104087b8cef9e 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -76,9 +76,6 @@ def test_pairwise_distances_reduction_is_usable_for(): X.astype(np.int64), Y.astype(np.int64), metric ) - assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) - assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) - assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") # TODO: remove once 32 bits datasets are supported assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) @@ -96,16 +93,20 @@ def test_argkmin_factory_method_wrong_usages(): k = 5 metric = "euclidean" - with pytest.raises( - ValueError, 
match="Only 64bit float datasets are supported for X and Y." - ): + msg = ( + "Only 64bit float datasets are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): PairwiseDistancesArgKmin.compute( X=X.astype(np.float32), Y=Y, k=k, metric=metric ) - with pytest.raises( - ValueError, match="Only 64bit float datasets are supported for X and Y." - ): + msg = ( + "Only 64bit float datasets are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) with pytest.raises(ValueError, match="k == -1, must be >= 1."): From 59b153cf5c60ed31715e47976591c603bb2189b7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 10:54:08 +0100 Subject: [PATCH 11/22] Remove unused _sqeuclidean_row_norms Co-authored-by: Thomas J. Fan --- .../metrics/_pairwise_distances_reduction.pyx | 25 ------------------- .../test_pairwise_distances_reduction.py | 22 ---------------- 2 files changed, 47 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 3ae51e0ee00ab..967c62b46546d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -51,31 +51,6 @@ from ..utils._typedefs import ITYPE, DTYPE np.import_array() - -cpdef DTYPE_t[::1] _sqeuclidean_row_norms( - const DTYPE_t[:, ::1] X, - ITYPE_t num_threads, -): - """Compute the squared euclidean norm of the rows of X in parallel. - - This is faster than using np.einsum("ij, ij->i") even when using a single thread. - """ - cdef: - # Casting for X to remove the const qualifier is needed because APIs - # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' - # const qualifier. - # See: https://github.com/scipy/scipy/issues/1426 - DTYPE_t * X_ptr = &X[0, 0] - ITYPE_t idx = 0 - ITYPE_t n = X.shape[0] - ITYPE_t d = X.shape[1] - DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) - - for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): - squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) - - return squared_row_norms - cdef class PairwiseDistancesReduction: """Abstract base class for pairwise distance computation & reduction. 
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 104087b8cef9e..18aba2acd79c8 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -6,7 +6,6 @@ from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, PairwiseDistancesArgKmin, - _sqeuclidean_row_norms, ) from sklearn.utils.fixes import sp_version, parse_version @@ -340,24 +339,3 @@ def test_euclidean_translation_invariance( ASSERT_RESULT[PairwiseDistancesReduction]( reference_dist, dist, reference_indices, indices ) - - -@pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("num_threads", [1, 2, 8]) -def test_sqeuclidean_row_norms( - seed, - n_samples, - n_features, - num_threads, - dtype=np.float64, -): - rng = np.random.RandomState(seed) - spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread - - sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 - sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) - - assert_allclose(sq_row_norm_reference, sq_row_norm) From 395f92a6ecec471e7f16bd127d01561fd7fb85f1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 11:06:39 +0100 Subject: [PATCH 12/22] Swap argkmin_indices and argkmin_distances To have argkmin_indices always be the first for consistency. Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 13 ++++++++++--- .../tests/test_pairwise_distances_reduction.py | 16 ++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 967c62b46546d..ce247971eabcd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -575,8 +575,15 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Returns ------- - Indices of argkmin for each vector in X and its associated distances - if return_distance=True. + If return_distance=False: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + If return_distance=True: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + - argkmin_distances : ndarray of shape (n_samples_X, k) + Distances to the argkmin for each vector in X. Notes ----- @@ -812,6 +819,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # We need to recompute distances because we relied on # surrogate distances for the reduction. 
self.compute_exact_distances() - return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + return np.asarray(self.argkmin_indices), np.asarray(self.argkmin_distances) return np.asarray(self.argkmin_indices) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 18aba2acd79c8..33c746e0ff15c 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -158,14 +158,14 @@ def test_chunk_size_agnosticism( else 10 ** np.log(n_features) ) - ref_dist, ref_indices = PairwiseDistancesReduction.compute( + ref_indices, ref_dist = PairwiseDistancesReduction.compute( X, Y, parameter, return_distance=True, ) - dist, indices = PairwiseDistancesReduction.compute( + indices, dist = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -204,14 +204,14 @@ def test_n_threads_agnosticism( else 10 ** np.log(n_features) ) - ref_dist, ref_indices = PairwiseDistancesReduction.compute( + ref_indices, ref_dist = PairwiseDistancesReduction.compute( X, Y, parameter, return_distance=True, ) - dist, indices = PairwiseDistancesReduction.compute( + indices, dist = PairwiseDistancesReduction.compute( X, Y, parameter, n_threads=1, return_distance=True ) @@ -251,7 +251,7 @@ def test_strategies_consistency( else 10 ** np.log(n_features) ) - dist_par_X, indices_par_X = PairwiseDistancesReduction.compute( + indices_par_X, dist_par_X = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -264,7 +264,7 @@ def test_strategies_consistency( return_distance=True, ) - dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute( + indices_par_Y, dist_par_Y = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -318,7 +318,7 @@ def test_euclidean_translation_invariance( X = np.ascontiguousarray(X[:, :2]) Y = np.ascontiguousarray(Y[:, :2]) - reference_dist, reference_indices = PairwiseDistancesReduction.compute( + reference_indices, reference_dist = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -327,7 +327,7 @@ def test_euclidean_translation_invariance( return_distance=True, ) - dist, indices = PairwiseDistancesReduction.compute( + indices, dist = PairwiseDistancesReduction.compute( X + 0, Y + 0, parameter, From 09a95272b4bd5b02692e8fde8ff26959f9814cae Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 4 Jan 2022 11:23:10 +0100 Subject: [PATCH 13/22] Move initializations from __init__ to __cinit__ This is more appropriate, especially for dynamic allocation. See: cython.readthedocs.io/en/latest/src/userguide/special_methods.html#initialisation-methods-cinit-and-init The __cinit__() method is where you should perform basic C-level initialisation of the object, including allocation of any C data structures that your object will own. Co-authored-by: Thomas J. 
Fan --- .../metrics/_pairwise_distances_reduction.pyx | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index ce247971eabcd..fe59d141f8ff7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -190,12 +190,14 @@ cdef class PairwiseDistancesReduction: not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) - def __init__( + def __cinit__( self, DatasetsPair datasets_pair, chunk_size=None, n_threads=None, strategy=None, + *args, + **kwargs, ): cdef: ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks @@ -474,9 +476,6 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): datasets_pair: DatasetsPair The dataset pairs (X, Y) for the reduction. - k: int - The k for the argkmin reduction. - chunk_size: int, default=None, The number of vectors per chunk. If None (default) looks-up in scikit-learn configuration for `pairwise_dist_chunk_size`, @@ -485,10 +484,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): n_threads: int, default=None The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks - depends on the `strategy` set on :method:`~ArgKmin.compute`. + depends on the `strategy` set on + :meth:`~PairwiseDistancesArgKmin.compute`. See _openmp_effective_n_threads, for details about the specification of n_threads. + + k: int, default=1 + The k for the argkmin reduction. """ cdef: @@ -544,7 +547,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): The number of OpenMP threads to use for the reduction. Parallelism is done on chunks and the sharding of chunks depends on the `strategy` set on - :method:`~PairwiseDistancesArgKmin.compute`. + :meth:`~PairwiseDistancesArgKmin.compute`. See _openmp_effective_n_threads, for details about the specification of n_threads. @@ -621,25 +624,22 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): return pda._finalize_results(return_distance) - def __init__( + def __cinit__( self, DatasetsPair datasets_pair, - ITYPE_t k, chunk_size=None, n_threads=None, strategy=None, - ): - super().__init__(datasets_pair, chunk_size, n_threads, strategy) - - self.k = check_scalar(k, "k", Integral, min_val=1) - + *args, + **kwargs, + ): # Allocating pointers to datastructures but not the datastructures themselves. # There are as many pointers as effective threads. # # For the sake of explicitness: - # - when parallelizing on X, those heaps pointers are referencing + # - when parallelizing on X, the pointers of those heaps are referencing # (with proper offsets) addresses of the two main heaps (see bellow) - # - when parallelizing on Y, those heaps pointer heaps are referencing + # - when parallelizing on Y, the pointers of those heaps are referencing # small heaps which are thread-wise-allocated and whose content will be # merged with the main heaps'. self.heaps_r_distances_chunks = malloc( @@ -649,6 +649,16 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.chunks_n_threads ) + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + strategy=None, + ITYPE_t k=1, + ): + self.k = check_scalar(k, "k", Integral, min_val=1) + # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`. 
self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -763,10 +773,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): with nogil, parallel(num_threads=self.effective_n_threads): # Synchronising the thread heaps with the main heaps. # This is done in parallel sample-wise (no need for locks). - # + # # This might break each thread's data locality as each heap which # was allocated in a thread is being now being used in several threads. - # + # # Still, this parallel pattern has shown to be efficient in practice. for idx in prange(X_end - X_start, schedule="static"): for thread_num in range(self.chunks_n_threads): @@ -792,7 +802,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): free(self.heaps_indices_chunks[thread_num]) # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). + # This is done in parallel sample-wise (no need for locks). for idx in prange(self.n_samples_X, schedule='static'): simultaneous_sort( &self.argkmin_distances[idx, 0], From f396a585ea67b2d99effa0b5b5a168b2b7e61172 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 08:30:44 +0100 Subject: [PATCH 14/22] Improve docstring comment Co-authored-by: Olivier Grisel --- sklearn/metrics/_pairwise_distances_reduction.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index fe59d141f8ff7..35c9095c57534 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -324,9 +324,10 @@ cdef class PairwiseDistancesReduction: and reduce them. This strategy dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. Private datastructures are modified internally by threads. 
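To make the chunking strategies concrete, here is a hypothetical usage sketch of the private `PairwiseDistancesArgKmin.compute` classmethod as it stands at this point in the series. The argument names and the `(indices, distances)` return order follow the tests shown earlier; since this is a private submodule, the interface may still change.

    import numpy as np
    from sklearn.metrics._pairwise_distances_reduction import PairwiseDistancesArgKmin

    rng = np.random.RandomState(0)
    X = rng.rand(1_000, 50)    # float64 and C-contiguous, as required
    Y = rng.rand(10_000, 50)

    # 'parallel_on_Y' is typically preferred when X has few rows but Y is large.
    argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute(
        X,
        Y,
        k=10,
        metric="euclidean",
        strategy="parallel_on_Y",
        return_distance=True,
    )
    # argkmin_indices[i] holds the indices of the 10 rows of Y closest to X[i];
    # argkmin_distances[i] holds the matching distances, sorted in ascending order.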
From 22f4f30e1f8c5d2bdd6af2333bde8da6ee548477 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 09:04:40 +0100 Subject: [PATCH 15/22] Improve comments Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 35c9095c57534..099c39df77604 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -24,22 +24,12 @@ from libc.float cimport DBL_MAX from cython cimport final from cython.parallel cimport parallel, prange -from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair -from ..utils._cython_blas cimport ( - BLAS_Order, - BLAS_Trans, - ColMajor, - NoTrans, - RowMajor, - Trans, - _dot, - _gemm, -) +from ._dist_metrics cimport DatasetsPair from ..utils._heap cimport simultaneous_sort, heap_push from ..utils._openmp_helpers cimport _openmp_thread_num from ..utils._typedefs cimport ITYPE_t, DTYPE_t -from numbers import Integral, Real +from numbers import Integral from typing import List from scipy.sparse import issparse from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING @@ -472,6 +462,12 @@ cdef class PairwiseDistancesReduction: cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): """Compute the argkmin of row vectors of X on the ones of Y. + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. + + PairwiseDistancesArgKmin is typically used to perform + bruteforce k-nearest neighbors queries. + Parameters ---------- datasets_pair: DatasetsPair @@ -556,19 +552,27 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None The chunking strategy defining which dataset parallelization are made on. - Strategies differs on the dispatching they use for chunks on threads: + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: - - 'parallel_on__X' dispatches chunks of X uniformly on threads. + - 'parallel_on_X' dispatches chunks of X uniformly on threads. Each thread then iterates on all the chunks of Y. This strategy is embarrassingly parallel and comes with no datastructures synchronisation. - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread then iterates on all the chunks of X. This strategy is - embarrassingly parallel but uses intermediate datastructures - synchronisation. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. - 'auto' relies on a simple heuristic to choose between - 'parallel_on__X' and 'parallel_on_Y'. + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. 
- None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. From 234fb01151bb60732023b09dd8d3bdc3ed5ce863 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 09:15:37 +0100 Subject: [PATCH 16/22] Add 'pairwise_dist_chunk_size' to scikit-learn config --- sklearn/_config.py | 30 ++++++++++++++++++++++++++++-- sklearn/tests/test_config.py | 3 +++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index c41c180012056..d6a02737f640d 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -9,6 +9,9 @@ "working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)), "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": int( + os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) + ), } _threadlocal = threading.local() @@ -40,7 +43,11 @@ def get_config(): def set_config( - assume_finite=None, working_memory=None, print_changed_only=None, display=None + assume_finite=None, + working_memory=None, + print_changed_only=None, + display=None, + pairwise_dist_chunk_size=None, ): """Set global scikit-learn configuration @@ -80,6 +87,12 @@ def set_config( .. versionadded:: 0.23 + pairwise_dist_chunk_size : int, default=None + The number of vectors per chunk for PairwiseDistancesReduction. + Default is 256 (suitable for most of modern laptops' caches and architectures). + + .. versionadded:: 1.1 + See Also -------- config_context : Context manager for global scikit-learn configuration. @@ -95,11 +108,18 @@ def set_config( local_config["print_changed_only"] = print_changed_only if display is not None: local_config["display"] = display + if pairwise_dist_chunk_size is not None: + local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size @contextmanager def config_context( - *, assume_finite=None, working_memory=None, print_changed_only=None, display=None + *, + assume_finite=None, + working_memory=None, + print_changed_only=None, + display=None, + pairwise_dist_chunk_size=None, ): """Context manager for global scikit-learn configuration. @@ -138,6 +158,12 @@ def config_context( .. versionadded:: 0.23 + pairwise_dist_chunk_size : int, default=None + The number of vectors per chunk for PairwiseDistancesReduction. + Default is 256 (suitable for most of modern laptops' caches and architectures). + + .. versionadded:: 1.1 + Yields ------ None. 
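The new option can be toggled globally with `set_config` or locally with `config_context`. A short sketch, assuming the `SKLEARN_PAIRWISE_DIST_CHUNK_SIZE` environment variable is not set so the default of 256 introduced by this patch applies:

    from sklearn import config_context, get_config

    # Default chunk size when the environment variable is unset.
    assert get_config()["pairwise_dist_chunk_size"] == 256

    with config_context(pairwise_dist_chunk_size=512):
        # Inside the context, chunk-based reductions use 512 vectors per chunk.
        assert get_config()["pairwise_dist_chunk_size"] == 512

    # The previous value is restored on exit.
    assert get_config()["pairwise_dist_chunk_size"] == 256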
diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index f78a9ff30b10a..e99eb5fc9db82 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -16,6 +16,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } # Not using as a context manager affects nothing @@ -28,6 +29,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } assert get_config()["assume_finite"] is False @@ -57,6 +59,7 @@ def test_config_context(): "working_memory": 1024, "print_changed_only": True, "display": "text", + "pairwise_dist_chunk_size": 256, } # No positional arguments From f89c65d4795e7d884eef27a1876d547a2b5b6f5b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 10:03:34 +0100 Subject: [PATCH 17/22] TST Adapt test for PairwiseDistancesArgKmin translation invariance Co-authored-by: Olivier Grisel --- .../test_pairwise_distances_reduction.py | 89 ++++++++++++------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 33c746e0ff15c..2982bc32b06d1 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -2,6 +2,7 @@ import pytest from numpy.testing import assert_array_equal, assert_allclose from scipy.sparse import csr_matrix +from scipy.spatial.distance import cdist from sklearn.metrics._pairwise_distances_reduction import ( PairwiseDistancesReduction, @@ -10,6 +11,22 @@ from sklearn.utils.fixes import sp_version, parse_version +# Common supported metric between scipy.spatial.distance.cdist +# and PairwiseDistancesReduction. +# This allows constructing tests to check consistency of results +# of concrete PairwiseDistancesReduction on some metrics using APIs +# from scipy and numpy. +CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "cityblock", + "euclidean", + "minkowski", + "seuclidean", + "wminkowski", +] + def _get_dummy_metric_params_list(metric: str, n_features: int): """Return list of dummy DistanceMetric kwargs for tests.""" @@ -285,57 +302,63 @@ def test_strategies_consistency( ) +# Concrete PairwiseDistancesReductions tests + + @pytest.mark.parametrize("n_features", [50, 500]) -@pytest.mark.parametrize("translation", [10 ** i for i in [4, 8]]) -@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize("translation", [10 ** i for i in [2, 4, 8]]) +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) @pytest.mark.parametrize( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin], ) -def test_euclidean_translation_invariance( +def test_argkmin_translation_invariance( n_features, translation, metric, - PairwiseDistancesReduction, - n_samples=1000, + strategy, + n_samples=100, + k=10, dtype=np.float64, ): - # The reduction must be translation invariant. - parameter = ( - 10 - if PairwiseDistancesReduction is PairwiseDistancesArgKmin - # Scaling the radius slightly with the numbers of dimensions - else 10 ** np.log(n_features) - ) + # PairwiseDistancesArgKmin must be translation invariant. 
rng = np.random.RandomState(0) - spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread - Y = rng.rand(n_samples, n_features).astype(dtype) * spread + spread = 1000 + X_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + Y_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread # Haversine distance only accepts 2D data if metric == "haversine": - X = np.ascontiguousarray(X[:, :2]) - Y = np.ascontiguousarray(Y[:, :2]) - - reference_indices, reference_dist = PairwiseDistancesReduction.compute( - X, - Y, - parameter, - metric=metric, - metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], - return_distance=True, - ) - - indices, dist = PairwiseDistancesReduction.compute( - X + 0, - Y + 0, - parameter, + X_translated = np.ascontiguousarray(X_translated[:, :2]) + Y_translated = np.ascontiguousarray(Y_translated[:, :2]) + + metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] + + # Reference for argkmin results + dist_matrix = cdist(X_translated, Y_translated, metric=metric, **metric_kwargs) + # Taking argkmin (indices of the k smallest values) + argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] + # Getting the associated distances + argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float) + for row_idx in range(argkmin_indices_ref.shape[0]): + argkmin_distances_ref[row_idx] = dist_matrix[ + row_idx, argkmin_indices_ref[row_idx] + ] + + argkmin_indices, argkmin_distances = PairwiseDistancesReduction.compute( + X_translated, + Y_translated, + k, metric=metric, - metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0], + metric_kwargs=metric_kwargs, return_distance=True, + # So as to have more than a chunk, forcing parallelism. + chunk_size=n_samples // 4, + strategy=strategy, ) ASSERT_RESULT[PairwiseDistancesReduction]( - reference_dist, dist, reference_indices, indices + argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref ) From fe17af1dca9ddbd2cf2a18cf5d24b3c57001ad6c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 5 Jan 2022 10:19:09 +0100 Subject: [PATCH 18/22] test_pairwise_distances_argkmin --- .../tests/test_pairwise_distances_reduction.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 2982bc32b06d1..eec8838e2f20d 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -306,14 +306,10 @@ def test_strategies_consistency( @pytest.mark.parametrize("n_features", [50, 500]) -@pytest.mark.parametrize("translation", [10 ** i for i in [2, 4, 8]]) +@pytest.mark.parametrize("translation", [0, 1e8]) @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) -@pytest.mark.parametrize( - "PairwiseDistancesReduction", - [PairwiseDistancesArgKmin], -) -def test_argkmin_translation_invariance( +def test_pairwise_distances_argkmin( n_features, translation, metric, @@ -322,8 +318,6 @@ def test_argkmin_translation_invariance( k=10, dtype=np.float64, ): - # PairwiseDistancesArgKmin must be translation invariant. 
- rng = np.random.RandomState(0) spread = 1000 X_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread @@ -347,7 +341,7 @@ def test_argkmin_translation_invariance( row_idx, argkmin_indices_ref[row_idx] ] - argkmin_indices, argkmin_distances = PairwiseDistancesReduction.compute( + argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute( X_translated, Y_translated, k, @@ -359,6 +353,6 @@ def test_argkmin_translation_invariance( strategy=strategy, ) - ASSERT_RESULT[PairwiseDistancesReduction]( + ASSERT_RESULT[PairwiseDistancesArgKmin]( argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref ) From 38715d2e5a06adef79fdf1e12de44555f9d4ae7d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 5 Jan 2022 10:24:36 +0100 Subject: [PATCH 19/22] Simpler variable names --- .../tests/test_pairwise_distances_reduction.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index eec8838e2f20d..c59d2f765eadb 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -320,18 +320,18 @@ def test_pairwise_distances_argkmin( ): rng = np.random.RandomState(0) spread = 1000 - X_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread - Y_translated = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread # Haversine distance only accepts 2D data if metric == "haversine": - X_translated = np.ascontiguousarray(X_translated[:, :2]) - Y_translated = np.ascontiguousarray(Y_translated[:, :2]) + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] # Reference for argkmin results - dist_matrix = cdist(X_translated, Y_translated, metric=metric, **metric_kwargs) + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) # Taking argkmin (indices of the k smallest values) argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] # Getting the associated distances @@ -342,8 +342,8 @@ def test_pairwise_distances_argkmin( ] argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute( - X_translated, - Y_translated, + X, + Y, k, metric=metric, metric_kwargs=metric_kwargs, From ce986d5310dddadebcf9121d46cdee7185f6416c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 5 Jan 2022 13:33:30 +0100 Subject: [PATCH 20/22] fixup! TST Adapt test for PairwiseDistancesArgKmin translation invariance --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c59d2f765eadb..06439ac7e180a 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -24,9 +24,12 @@ "euclidean", "minkowski", "seuclidean", - "wminkowski", ] +# TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0. 
+if sp_version < parse_version("1.8.0.dev0"):
+    CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS.append("wminkowski")
+

 def _get_dummy_metric_params_list(metric: str, n_features: int):
     """Return list of dummy DistanceMetric kwargs for tests."""

From 70a28b7b6f19a76acf4634c1b33cb789b5440788 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 5 Jan 2022 14:28:48 +0100
Subject: [PATCH 21/22] fixup! fixup! TST Adapt test for PairwiseDistancesArgKmin translation invariance

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 06439ac7e180a..a4d51e4662740 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -26,10 +26,6 @@
     "seuclidean",
 ]

-# TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0.
-if sp_version < parse_version("1.8.0.dev0"):
-    CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS.append("wminkowski")
-

 def _get_dummy_metric_params_list(metric: str, n_features: int):
     """Return list of dummy DistanceMetric kwargs for tests."""
@@ -338,7 +334,7 @@ def test_pairwise_distances_argkmin(
     # Taking argkmin (indices of the k smallest values)
     argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
     # Getting the associated distances
-    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float)
+    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
     for row_idx in range(argkmin_indices_ref.shape[0]):
         argkmin_distances_ref[row_idx] = dist_matrix[
             row_idx, argkmin_indices_ref[row_idx]
         ]

From 06ca86936e52f5fe74f8d4791063d011d065da30 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 5 Jan 2022 14:42:27 +0100
Subject: [PATCH 22/22] Use correct spelling for 'Callback'

Co-authored-by: Christian Lorentzen
---
 sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx
index 099c39df77604..830df08e1a952 100644
--- a/sklearn/metrics/_pairwise_distances_reduction.pyx
+++ b/sklearn/metrics/_pairwise_distances_reduction.pyx
@@ -389,7 +389,7 @@ cdef class PairwiseDistancesReduction:
         return

     def _finalize_results(self, bint return_distance):
-        """Call-back adapting datastructures before returning results.
+        """Callback adapting datastructures before returning results.

         This must be implemented in subclasses.
         """
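Reviewer note (not part of the diff): the brute-force reference built in `test_pairwise_distances_argkmin` boils down to the standalone NumPy/SciPy sketch below. `np.take_along_axis` gathers the reference distances in a single call and is equivalent to the explicit row loop used in the test; the sample sizes here are arbitrary.

    import numpy as np
    from scipy.spatial.distance import cdist

    rng = np.random.RandomState(0)
    n_samples, n_features, k = 100, 50, 10
    X = rng.rand(n_samples, n_features)
    Y = rng.rand(n_samples, n_features)

    # Brute-force reference: the full pairwise distance matrix.
    dist_matrix = cdist(X, Y, metric="euclidean")

    # Indices of the k smallest distances per row, sorted by increasing distance.
    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
    # Distances associated with those indices, gathered row-wise.
    argkmin_distances_ref = np.take_along_axis(dist_matrix, argkmin_indices_ref, axis=1)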