diff --git a/.gitignore b/.gitignore index f4125316d7d41..967ce97dc38ad 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,4 @@ sklearn/utils/_weight_vector.pxd sklearn/linear_model/_sag_fast.pyx sklearn/metrics/_dist_metrics.pyx sklearn/metrics/_dist_metrics.pxd +sklearn/metrics/_pairwise_distances_reduction.pyx diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 4c46c0d631f76..8b5e106e5206a 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -186,6 +186,9 @@ Changelog - |Efficiency| Low-level routines for reductions on pairwise distances + for dense float32 and float64 datasets have been refactored. + The following functions and estimators now benefit from improved performances, + in particular on multi-cores machines: for dense float64 datasets have been refactored. The following functions and estimators now benefit from improved performances in terms of hardware scalability and speed-ups: diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 32ba546672c6e..8fb2f602dc086 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -100,12 +100,11 @@ cdef class DistanceMetric{{name_suffix}}: cdef DTYPE_t _dist_to_rdist(self, {{DTYPE_t}} dist) nogil except -1 -{{endfor}} ###################################################################### # DatasetsPair base class -cdef class DatasetsPair: - cdef DistanceMetric distance_metric +cdef class DatasetsPair{{name_suffix}}: + cdef DistanceMetric{{name_suffix}} distance_metric cdef ITYPE_t n_samples_X(self) nogil @@ -116,8 +115,10 @@ cdef class DatasetsPair: cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil -cdef class DenseDenseDatasetsPair(DatasetsPair): +cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y + const {{DTYPE_t}}[:, ::1] X + const {{DTYPE_t}}[:, ::1] Y ITYPE_t d + +{{endfor}} diff --git 
a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 5986fa939b45d..71644b251c42c 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -1170,11 +1170,10 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise TypeError("Custom distance function must accept two " "vectors and return a float.") -{{endfor}} ###################################################################### # Datasets Pair Classes -cdef class DatasetsPair: +cdef class DatasetsPair{{name_suffix}}: """Abstract class which wraps a pair of datasets (X, Y). This class allows computing distances between a single pair of rows of @@ -1211,7 +1210,7 @@ cdef class DatasetsPair: Y, str metric="euclidean", dict metric_kwargs=None, - ) -> DatasetsPair: + ) -> DatasetsPair{{name_suffix}}: """Return the DatasetsPair implementation for the given arguments. Parameters @@ -1241,14 +1240,14 @@ cdef class DatasetsPair: The suited DatasetsPair implementation. """ cdef: - DistanceMetric distance_metric = DistanceMetric.get_metric( + DistanceMetric{{name_suffix}} distance_metric = DistanceMetric{{name_suffix}}.get_metric( metric, **(metric_kwargs or {}) ) - if not(X.dtype == Y.dtype == np.float64): + if not(X.dtype == Y.dtype and X.dtype in DatasetsPair{{name_suffix}}.valid_dtypes()): raise ValueError( - f"Only 64bit float datasets are supported at this time, " + f"Only np.float64 and np.float32 datasets are supported at this time, " f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." 
) @@ -1260,11 +1259,15 @@ cdef class DatasetsPair: if issparse(X) or issparse(Y): raise ValueError("Only dense datasets are supported for X and Y.") - return DenseDenseDatasetsPair(X, Y, distance_metric) + return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) - def __init__(self, DistanceMetric distance_metric): + def __init__(self, DistanceMetric{{name_suffix}} distance_metric): self.distance_metric = distance_metric + @classmethod + def valid_dtypes(cls): + return (np.float64, np.float32) + cdef ITYPE_t n_samples_X(self) nogil: """Number of samples in X.""" # This is a abstract method. @@ -1289,7 +1292,7 @@ cdef class DatasetsPair: return -1 @final -cdef class DenseDenseDatasetsPair(DatasetsPair): +cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): """Compute distances between row vectors of two arrays. Parameters @@ -1305,7 +1308,7 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): between two row vectors of (X, Y). """ - def __init__(self, X, Y, DistanceMetric distance_metric): + def __init__(self, X, Y, DistanceMetric{{name_suffix}} distance_metric): super().__init__(distance_metric) # Arrays have already been checked self.X = X @@ -1331,3 +1334,5 @@ cdef class DenseDenseDatasetsPair(DatasetsPair): return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.d) + +{{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp similarity index 69% rename from sklearn/metrics/_pairwise_distances_reduction.pyx rename to sklearn/metrics/_pairwise_distances_reduction.pyx.tp index 9191efae2a8da..8193832f1b494 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx.tp @@ -1,3 +1,30 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, bitness, DTYPE_t, DTYPE, need_upcast + # + # On the first hand, an empty string is use for `name_suffix` + # 
for the 64bit case so as to still be able to expose the original + # 64bit implementation under the same API, namely `DistanceMetric`. + # + # On the other hand, '32' is used for `name_suffix` + # for the 32bit case to remove ambiguity and use `DistanceMetric32`, + # which is not publicly exposed. + # + # The metric mapping is adapted accordingly to route to the correct + # implementations. + # + # We also use 64bit types as defined in `sklearn.utils._typedefs` + # to maintain backward compatibility at the symbol level for extra + # safety. + # + ('', '64', 'DTYPE_t', 'DTYPE', False), + ('32', '32', 'cnp.float32_t', 'np.float32', True) +] + +}} # Pairwise Distances Reductions # ============================= # @@ -25,9 +52,7 @@ from libcpp.vector cimport vector from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange -from cpython.ref cimport Py_INCREF -from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair from ..utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, @@ -53,7 +78,6 @@ from ..utils.fixes import threadpool_limits from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._typedefs import ITYPE, DTYPE - cnp.import_array() # TODO: change for `libcpp.algorithm.move` once Cython 3 is used @@ -82,8 +106,7 @@ cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( """Coerce a std::vector of std::vector to a ndarray of ndarray.""" cdef: ITYPE_t n = deref(vecs).size() - cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, - dtype=np.ndarray) + cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) for i in range(n): nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) @@ -117,7 +140,19 @@ cpdef DTYPE_t[::1] _sqeuclidean_row_norms( return squared_row_norms ##################### +# Interfaces: +# These interfaces are meant to be used in the Python code, decoupling the +# actual implementation from the Python 
code. This allows changing all the +# private implementation while maintaining a contract for the Python callers. +# +# Each interface extending the base `PairwiseDistancesReduction` interface must +# implement the :meth:`compute` classmethod. +# +# Under the hood, such a function must only define the logic to dispatch +# at runtime to the correct dtype-specialized `PairwiseDistancesReduction` +# implementation based on the dtype of X and of Y. +# Base interface cdef class PairwiseDistancesReduction: """Abstract base class for pairwise distance computation & reduction. @@ -183,32 +218,6 @@ cdef class PairwiseDistancesReduction: `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. """ - cdef: - readonly DatasetsPair datasets_pair - - # The number of threads that can be used is stored in effective_n_threads. - # - # The number of threads to use in the parallelisation strategy - # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: - # for small datasets, less threads might be needed to loop over pair of chunks. - # - # Hence the number of threads that _will_ be used for looping over chunks - # is stored in chunks_n_threads, allowing solely using what we need. - # - # Thus, an invariant is: - # - # chunks_n_threads <= effective_n_threads - # - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - - ITYPE_t n_samples_chunk, chunk_size - - ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk - ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk - - bint execute_in_parallel_on_Y - @classmethod def valid_metrics(cls) -> List[str]: excluded = { @@ -223,6 +232,10 @@ cdef class PairwiseDistancesReduction: } return sorted(set(METRIC_MAPPING.keys()) - excluded) + @classmethod + def valid_dtypes(cls): + return (np.float32, np.float64) + @classmethod def is_usable_for(cls, X, Y, metric) -> bool: """Return True if the PairwiseDistancesReduction can be used for the given parameters. 
@@ -232,27 +245,406 @@ cdef class PairwiseDistancesReduction: X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) Input data. - Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) - Input data. + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the PairwiseDistancesReduction can be used, else False. + """ + dtypes_validity = X.dtype == Y.dtype and Y.dtype in cls.valid_dtypes() + return (get_config().get("enable_cython_pairwise_dist", True) and + not issparse(X) and not issparse(Y) and dtypes_validity and + metric in cls.valid_metrics()) + + +cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): + """Compute the argkmin of row vectors of X on the ones of Y. + + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. + + PairwiseDistancesArgKmin is typically used to perform + bruteforce k-nearest neighbors queries. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + k: int, default=1 + The k for the argkmin reduction. + """ + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + ): + """Return the results of the reduction for the given arguments. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + k : int + The k for the argkmin reduction. 
+ + metric : str, default='euclidean' + The distance metric to use for argkmin. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. 
+ + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + + Returns + ------- + If return_distance=False: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + If return_distance=True: + - argkmin_distances : ndarray of shape (n_samples_X, k) + Distances to the argkmin for each vector in X. + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + Notes + ----- + This public classmethod is responsible for introspecting the arguments + values to dispatch to the proper implementations of + :meth:`PairwiseDistancesArgKmin` given the types. + + All temporarily allocated datastructures necessary for the concrete + implementation are therefore freed when this classmethod returns. + + This allows decoupling the interface entirely from the + implementation details whilst maintaining RAII. + """ +{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}} + if X.dtype == Y.dtype == np.float{{bitness}}: + return PairwiseDistancesArgKmin{{bitness}}.compute( + X=X, + Y=Y, + k=k, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + return_distance=return_distance, + ) +{{endfor}} + raise ValueError( + f"Datasets must both be of np.float64 or np.float32 dtype. " + f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + +cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): + """Compute radius-based neighbors for two sets of vectors. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + The distance function `dist` depends on the values of the `metric` + and `metric_kwargs` parameters. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pair (X, Y) for the reduction. 
+ + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + radius: float + The radius defining the neighborhood. + """ + + @classmethod + def compute( + cls, + X, + Y, + DTYPE_t radius, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + bint sort_results=False, + ): + """Return the results of the reduction for the given arguments. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + radius : float + The radius defining the neighborhood. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. 
This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` + is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity + for parallelism and is therefore more efficient despite the synchronization + step at each iteration of the outer loop on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its neighbors if set to True. + + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + Returns + ------- + If return_distance=False: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + + If return_distance=True: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + - neighbors_distances : ndarray of n_samples_X ndarray + Distances to the neighbors for each vector in X. + + Notes + ----- + This public classmethod is responsible for introspecting the arguments + values to dispatch to the private + :meth:`PairwiseDistancesRadiusNeighborhood._compute` instance method of + the most appropriate :class:`PairwiseDistancesRadiusNeighborhood` + concrete implementation. + + All temporarily allocated datastructures necessary for the concrete + implementation are therefore freed when this classmethod returns. + + This allows entirely decoupling the interface entirely from the + implementation details whilst maintaining RAII. 
+ """ +{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}} + if X.dtype == Y.dtype == np.float{{bitness}}: + return PairwiseDistancesRadiusNeighborhood{{bitness}}.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + return_distance=return_distance, + ) +{{endfor}} + raise ValueError( + f"Datasets must both be of np.float64 or np.float32 dtype. " + f"Currently: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms64( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + DTYPE_t * X_ptr = <DTYPE_t *> &X[0, 0] + ITYPE_t i = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) + + for i in prange(n, schedule='static', nogil=True, num_threads=num_threads): + squared_row_norms[i] = _dot(d, X_ptr + i * d, 1, X_ptr + i * d, 1) + + return squared_row_norms + + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms32( + const cnp.float32_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. 
+ # See: https://github.com/scipy/scipy/issues/14262 + cnp.float32_t * X_ptr = <cnp.float32_t *> &X[0, 0] + ITYPE_t i = 0, j = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] squared_row_norms = np.empty(n, dtype=DTYPE) + + # Buffer used to upcast the i-th row of X from 32bit to 64bit + DTYPE_t * X_i_upcast_ptr + + with nogil, parallel(num_threads=num_threads): + # Buffer allocated inside the parallel block and freed before leaving it + X_i_upcast_ptr = <DTYPE_t *> malloc(sizeof(DTYPE_t) * d) + for i in prange(n, schedule='static'): + + # Upcasting the i-th row of X from 32bit to 64bit + for j in range(d): + X_i_upcast_ptr[j] = deref(X_ptr + i * d + j) + + squared_row_norms[i] = _dot(d, X_i_upcast_ptr, 1, X_i_upcast_ptr, 1) + + free(X_i_upcast_ptr) + + return squared_row_norms + +##################### +# dtype-specific implementations: +# For each dtype, an implementation of `PairwiseDistancesReduction` is generated by Tempita. +# Computations are dispatched to them at runtime via the interfaces defined above. + +{{for distance_suffix, bitness, DTYPE_t, DTYPE, need_upcast in implementation_specific_values}} + +from ._dist_metrics cimport DatasetsPair{{distance_suffix}}, DenseDenseDatasetsPair{{distance_suffix}} + +cdef class PairwiseDistancesReduction{{bitness}}(PairwiseDistancesReduction): + """{{bitness}}bit implementation of PairwiseDistancesReduction.""" + + cdef: + readonly DatasetsPair{{distance_suffix}} datasets_pair + + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelisation strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, less threads might be needed to loop over pair of chunks. + # + # Hence the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. 
+ # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + + ITYPE_t n_samples_chunk, chunk_size - metric : str, default='euclidean' - The distance metric to use. - For a list of available metrics, see the documentation of - :class:`~sklearn.metrics.DistanceMetric`. + ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk - Returns - ------- - True if the PairwiseDistancesReduction can be used, else False. - """ - # TODO: support sparse arrays and 32 bits - return (get_config().get("enable_cython_pairwise_dist", True) and - not issparse(X) and X.dtype == np.float64 and - not issparse(Y) and Y.dtype == np.float64 and - metric in cls.valid_metrics()) + bint execute_in_parallel_on_Y def __init__( self, - DatasetsPair datasets_pair, + DatasetsPair{{distance_suffix}} datasets_pair, chunk_size=None, strategy=None, ): @@ -348,7 +740,8 @@ cdef class PairwiseDistancesReduction: X_end = X_start + self.X_n_samples_chunk # Reinitializing thread datastructures for the new X chunk - self._parallel_on_X_init_chunk(thread_num, X_start) + # Eventually upcast X[X_start:X_end] to 64bit + self._parallel_on_X_init_chunk(thread_num, X_start, X_end) for Y_chunk_idx in range(self.Y_n_chunks): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -357,6 +750,13 @@ cdef class PairwiseDistancesReduction: else: Y_end = Y_start + self.Y_n_samples_chunk + # Eventually upcast Y[Y_start:Y_end] to 64bit + self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self._compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, @@ -409,7 +809,8 @@ cdef class PairwiseDistancesReduction: thread_num = _openmp_thread_num() # Initializing datastructures used in this thread - self._parallel_on_Y_parallel_init(thread_num) + # Eventually upcast X[X_start:X_end] 
to 64bit + self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): Y_start = Y_chunk_idx * self.Y_n_samples_chunk @@ -418,6 +819,13 @@ cdef class PairwiseDistancesReduction: else: Y_end = Y_start + self.Y_n_samples_chunk + # Eventually upcast Y[Y_start:Y_end] to 64bit + self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self._compute_and_reduce_distances_on_chunks( X_start, X_end, Y_start, Y_end, @@ -450,8 +858,9 @@ cdef class PairwiseDistancesReduction: ) nogil: """Compute the pairwise distances on two chunks of X and Y and reduce them. - This is THE core computational method of PairwiseDistanceReductions. - This must be implemented in subclasses. + This is THE core computational method of PairwiseDistancesReduction{{bitness}}. + This must be implemented in subclasses agnostically from the parallelisation + strategies. """ return @@ -479,10 +888,25 @@ cdef class PairwiseDistancesReduction: self, ITYPE_t thread_num, ITYPE_t X_start, + ITYPE_t X_end, ) nogil: """Initialise datastructures used in a thread given its number.""" return + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. + + This can be used to upcast Y[Y_start:Y_end] to 64bit. 
+ """ + return + cdef void _parallel_on_X_prange_iter_finalize( self, ITYPE_t thread_num, @@ -508,10 +932,26 @@ cdef class PairwiseDistancesReduction: cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: """Initialise datastructures used in a thread given its number.""" return + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures just before the _compute_and_reduce_distances_on_chunks. + + This can be used to upcast Y[Y_start:Y_end] to 64bit. + """ + return + cdef void _parallel_on_Y_synchronize( self, ITYPE_t X_start, @@ -526,28 +966,8 @@ cdef class PairwiseDistancesReduction: """Update datastructures after executing all the reductions.""" return -cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): - """Compute the argkmin of row vectors of X on the ones of Y. - - For each row vector of X, computes the indices of k first the rows - vectors of Y with the smallest distances. - - PairwiseDistancesArgKmin is typically used to perform - bruteforce k-nearest neighbors queries. - - Parameters - ---------- - datasets_pair: DatasetsPair - The dataset pairs (X, Y) for the reduction. - - chunk_size: int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - - k: int, default=1 - The k for the argkmin reduction. 
- """ +cdef class PairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesReduction{{bitness}}): + """{{bitness}}bit implementation of PairwiseDistancesArgKmin.""" cdef: ITYPE_t k @@ -644,14 +1064,14 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Notes ----- This public classmethod is responsible for introspecting the arguments - values to dispatch to the private :meth:`PairwiseDistancesArgKmin._compute` - instance method of the most appropriate :class:`PairwiseDistancesArgKmin` + values to dispatch to the private :meth:`PairwiseDistancesArgKmin{{bitness}}._compute` + instance method of the most appropriate :class:`PairwiseDistancesArgKmin{{bitness}}` concrete implementation. All temporarily allocated datastructures necessary for the concrete implementation are therefore freed when this classmethod returns. - This allows entirely decoupling the interface entirely from the + This allows decoupling the interface entirely from the implementation details whilst maintaining RAII. """ # Note (jjerphan): Some design thoughts for future extensions. @@ -669,7 +1089,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # at time to leverage a call to the BLAS GEMM routine as explained # in more details in the docstring. use_squared_distances = metric == "sqeuclidean" - pda = FastEuclideanPairwiseDistancesArgKmin( + pda = FastEuclideanPairwiseDistancesArgKmin{{bitness}}( X=X, Y=Y, k=k, use_squared_distances=use_squared_distances, chunk_size=chunk_size, @@ -679,8 +1099,8 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): else: # Fall back on a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. 
- pda = PairwiseDistancesArgKmin( - datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + pda = PairwiseDistancesArgKmin{{bitness}}( + datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs), k=k, chunk_size=chunk_size, strategy=strategy, @@ -698,7 +1118,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): def __init__( self, - DatasetsPair datasets_pair, + DatasetsPair{{distance_suffix}} datasets_pair, chunk_size=None, strategy=None, ITYPE_t k=1, @@ -726,7 +1146,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): sizeof(ITYPE_t *) * self.chunks_n_threads ) - # Main heaps which will be returned as results by `PairwiseDistancesArgKmin.compute`. + # Main heaps which will be returned as results by `PairwiseDistancesArgKmin{{bitness}}.compute`. self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=ITYPE) self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=DTYPE) @@ -764,11 +1184,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Y_start + j, ) - @final cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set each # thread's heaps pointer to the proper position on the main heaps. @@ -819,10 +1239,11 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): heaps_size * sizeof(ITYPE_t) ) - @final cdef void _parallel_on_Y_parallel_init( self, ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # Initialising heaps (memset can't be used here) for idx in range(self.X_n_samples_chunk * self.k): @@ -899,17 +1320,17 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # Values are returned identically to the way `KNeighborsMixin.kneighbors` # returns values. This is counter-intuitive but this allows not using - # complex adaptations where `PairwiseDistancesArgKmin.compute` is called. 
+ # complex adaptations where `PairwiseDistancesArgKmin{{bitness}}.compute` is called. return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) return np.asarray(self.argkmin_indices) -cdef class GEMMTermComputer: +cdef class GEMMTermComputer{{bitness}}: """Component for `FastEuclidean*` variant wrapping the logic for the call to GEMM. `FastEuclidean*` classes internally compute the squared Euclidean distances between - chunks of vectors X_c and Y_c using using the decomposition: + chunks of vectors X_c and Y_c using the following decomposition: ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² @@ -919,42 +1340,146 @@ cdef class GEMMTermComputer: the middle term `- 2 X_c_i.Y_c_j^T` with a call to GEMM, which has a high arithmetic intensity. """ - cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y + const {{DTYPE_t}}[:, ::1] X + const {{DTYPE_t}}[:, ::1] Y ITYPE_t effective_n_threads ITYPE_t chunks_n_threads ITYPE_t dist_middle_terms_chunks_size + ITYPE_t n_features + ITYPE_t chunk_size # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM vector[vector[DTYPE_t]] dist_middle_terms_chunks +{{if need_upcast}} + # Buffers for upcasting chunks of X and Y from 32bit to 64bit + vector[vector[DTYPE_t]] X_c_upcast + vector[vector[DTYPE_t]] Y_c_upcast +{{endif}} + def __init__(self, - DTYPE_t[:, ::1] X, - DTYPE_t[:, ::1] Y, + {{DTYPE_t}}[:, ::1] X, + {{DTYPE_t}}[:, ::1] Y, ITYPE_t effective_n_threads, ITYPE_t chunks_n_threads, ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, ): self.X = X self.Y = Y self.effective_n_threads = effective_n_threads self.chunks_n_threads = chunks_n_threads self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = n_features + self.chunk_size = chunk_size self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) +{{if need_upcast}} + # We populate the buffer for upcasting chunks of X and Y from 32bit to 64bit. 
+ self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) + self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) + + upcast_buffer_n_elements = self.chunk_size * n_features + + for thread_num in range(self.effective_n_threads): + self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) + self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) +{{endif}} + + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: +{{if need_upcast}} + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] +{{else}} + return +{{endif}} + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: +{{if need_upcast}} + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] +{{else}} + return +{{endif}} + cdef void _parallel_on_Y_init(self) nogil: for thread_num in range(self.chunks_n_threads): self.dist_middle_terms_chunks[thread_num].resize( self.dist_middle_terms_chunks_size ) + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: +{{if need_upcast}} + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in 
range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: +{{if need_upcast}} + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] +{{else}} + return +{{endif}} + cdef DTYPE_t * _compute_distances_on_chunks( self, ITYPE_t X_start, @@ -966,9 +1491,8 @@ cdef class GEMMTermComputer: cdef: ITYPE_t i, j DTYPE_t squared_dist_i_j - - const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] - const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + const {{DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] + const {{DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() # Careful: LDA, LDB and LDC are given for F-ordered arrays @@ -983,12 +1507,17 @@ cdef class GEMMTermComputer: ITYPE_t n = Y_c.shape[0] ITYPE_t K = X_c.shape[1] DTYPE_t alpha = - 2. +{{if need_upcast}} + DTYPE_t * A = self.X_c_upcast[thread_num].data() + DTYPE_t * B = self.Y_c_upcast[thread_num].data() +{{else}} # Casting for A and B to remove the const is needed because APIs exposed via # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. # See: https://github.com/scipy/scipy/issues/14262 DTYPE_t * A = &X_c[0, 0] - ITYPE_t lda = X_c.shape[1] DTYPE_t * B = &Y_c[0, 0] +{{endif}} + ITYPE_t lda = X_c.shape[1] ITYPE_t ldb = X_c.shape[1] DTYPE_t beta = 0. 
ITYPE_t ldc = Y_c.shape[0] @@ -999,25 +1528,11 @@ cdef class GEMMTermComputer: return dist_middle_terms -cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): - """Fast specialized variant for PairwiseDistancesArgKmin on EuclideanDistance. - - The full pairwise squared distances matrix is computed as follows: - - ||X - Y||² = ||X||² - 2 X.Y^T + ||Y||² - - The middle term gets computed efficiently below using BLAS Level 3 GEMM. - - Notes - ----- - This implementation has a superior arithmetic intensity and hence - better running time when the variant is IO bound, but it can suffer - from numerical instability caused by catastrophic cancellation potentially - introduced by the subtraction in the arithmetic expression above. - """ - +cdef class FastEuclideanPairwiseDistancesArgKmin{{bitness}}(PairwiseDistancesArgKmin{{bitness}}): + """Fast specialized alternative for PairwiseDistancesArgKmin{{bitness}} on EuclideanDistance.""" cdef: - GEMMTermComputer gemm_term_computer + GEMMTermComputer{{bitness}} gemm_term_computer + const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared @@ -1025,7 +1540,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and + return (PairwiseDistancesArgKmin{{bitness}}.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) def __init__( @@ -1052,57 +1567,125 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): super().__init__( # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric="euclidean"), chunk_size=chunk_size, strategy=strategy, k=k, ) - # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + # X and Y are checked by the 
DatasetsPair{{distance_suffix}} implemented as a DenseDenseDatasetsPair{{distance_suffix}} cdef: - DenseDenseDatasetsPair datasets_pair = self.datasets_pair + DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = ( + self.datasets_pair + ) ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.gemm_term_computer = GEMMTermComputer( + self.gemm_term_computer = GEMMTermComputer{{bitness}}( datasets_pair.X, datasets_pair.Y, self.effective_n_threads, self.chunks_n_threads, dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + chunk_size=self.chunk_size, ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") else: - self.Y_norm_squared = _sqeuclidean_row_norms(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(datasets_pair.Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms{{bitness}}(datasets_pair.X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances @final cdef void compute_exact_distances(self) nogil: if not self.use_squared_distances: - PairwiseDistancesArgKmin.compute_exact_distances(self) + PairwiseDistancesArgKmin{{bitness}}.compute_exact_distances(self) @final cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - PairwiseDistancesArgKmin._parallel_on_X_parallel_init(self, thread_num) + PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_parallel_init(self, thread_num) self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) 
+ self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final cdef void _parallel_on_Y_init( self, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesArgKmin._parallel_on_Y_init(self) + PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final cdef void _compute_and_reduce_distances_on_chunks( self, @@ -1145,7 +1728,7 @@ cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): ) -cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): +cdef class PairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseDistancesReduction{{bitness}}): """Compute radius-based neighbors 
for two sets of vectors. For each row-vector X[i] of the queries X, find all the indices j of @@ -1321,7 +1904,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): # at time to leverage a call to the BLAS GEMM routine as explained # in more details in the docstring. use_squared_distances = metric == "sqeuclidean" - pda = FastEuclideanPairwiseDistancesRadiusNeighborhood( + pda = FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}( X=X, Y=Y, radius=radius, use_squared_distances=use_squared_distances, chunk_size=chunk_size, @@ -1332,8 +1915,8 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): else: # Fall back on a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. - pda = PairwiseDistancesRadiusNeighborhood( - datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + pda = PairwiseDistancesRadiusNeighborhood{{bitness}}( + datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric, metric_kwargs), radius=radius, chunk_size=chunk_size, metric_kwargs=metric_kwargs, @@ -1354,7 +1937,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): def __init__( self, - DatasetsPair datasets_pair, + DatasetsPair{{distance_suffix}} datasets_pair, DTYPE_t radius, chunk_size=None, strategy=None, @@ -1423,11 +2006,11 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): return coerce_vectors_to_nd_arrays(self.neigh_indices) - @final cdef void _parallel_on_X_init_chunk( self, ITYPE_t thread_num, ITYPE_t X_start, + ITYPE_t X_end, ) nogil: # As this strategy is embarrassingly parallel, we can set the @@ -1546,7 +2129,7 @@ cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): ) -cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): +cdef class 
FastEuclideanPairwiseDistancesRadiusNeighborhood{{bitness}}(PairwiseDistancesRadiusNeighborhood{{bitness}}): """Fast specialized variant for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. The full pairwise squared distances matrix is computed as follows: @@ -1565,7 +2148,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad """ cdef: - GEMMTermComputer gemm_term_computer + GEMMTermComputer{{bitness}} gemm_term_computer const DTYPE_t[::1] X_norm_squared const DTYPE_t[::1] Y_norm_squared @@ -1573,7 +2156,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad @classmethod def is_usable_for(cls, X, Y, metric) -> bool: - return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) + return (PairwiseDistancesRadiusNeighborhood{{bitness}}.is_usable_for(X, Y, metric) and not _in_unstable_openblas_configuration()) def __init__( @@ -1601,7 +2184,7 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad super().__init__( # The datasets pair here is used for exact distances computations - datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + datasets_pair=DatasetsPair{{distance_suffix}}.get_for(X, Y, metric="euclidean"), radius=radius, chunk_size=chunk_size, strategy=strategy, @@ -1610,26 +2193,28 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad ) # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair cdef: - DenseDenseDatasetsPair datasets_pair = self.datasets_pair + DenseDenseDatasetsPair{{distance_suffix}} datasets_pair = self.datasets_pair ITYPE_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk - self.gemm_term_computer = GEMMTermComputer( + self.gemm_term_computer = GEMMTermComputer{{bitness}}( datasets_pair.X, datasets_pair.Y, self.effective_n_threads, self.chunks_n_threads, dist_middle_terms_chunks_size, + n_features=datasets_pair.X.shape[1], + 
chunk_size=self.chunk_size, ) if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: self.Y_norm_squared = metric_kwargs.pop("Y_norm_squared") else: - self.Y_norm_squared = _sqeuclidean_row_norms(datasets_pair.Y, self.effective_n_threads) + self.Y_norm_squared = _sqeuclidean_row_norms{{bitness}}(datasets_pair.Y, self.effective_n_threads) # Do not recompute norms if datasets are identical. self.X_norm_squared = ( self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms(datasets_pair.X, self.effective_n_threads) + _sqeuclidean_row_norms{{bitness}}(datasets_pair.X, self.effective_n_threads) ) self.use_squared_distances = use_squared_distances @@ -1638,27 +2223,85 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad # already considered to be the adapted radius, so we overwrite it. self.r_radius = radius - @final - cdef void compute_exact_distances(self) nogil: - if not self.use_squared_distances: - PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) - @final cdef void _parallel_on_X_parallel_init( self, ITYPE_t thread_num, ) nogil: - PairwiseDistancesRadiusNeighborhood._parallel_on_X_parallel_init(self, thread_num) + PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_parallel_init(self, thread_num) self.gemm_term_computer._parallel_on_X_parallel_init(thread_num) + @final + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + 
Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + @final cdef void _parallel_on_Y_init( self, ) nogil: cdef ITYPE_t thread_num - PairwiseDistancesRadiusNeighborhood._parallel_on_Y_init(self) + PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() + @final + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.gemm_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood{{bitness}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.gemm_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesRadiusNeighborhood{{bitness}}.compute_exact_distances(self) + @final cdef void _compute_and_reduce_distances_on_chunks( self, @@ -1692,3 +2335,4 @@ cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRad if squared_dist_i_j <= self.r_radius: deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) +{{endfor}} diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index fc912068cb6c4..c343cadb2c258 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -24,6 
+24,7 @@ def configuration(parent_package="", top_path=None): templates = [ "sklearn/metrics/_dist_metrics.pyx.tp", "sklearn/metrics/_dist_metrics.pxd.tp", + "sklearn/metrics/_pairwise_distances_reduction.pyx.tp", ] gen_from_templates(templates) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 192f7ef43a6c6..7bfd89dc06800 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -9,7 +9,8 @@ PairwiseDistancesReduction, PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood, - _sqeuclidean_row_norms, + _sqeuclidean_row_norms64, + _sqeuclidean_row_norms32, ) from sklearn.metrics import euclidean_distances @@ -66,7 +67,7 @@ def _get_metric_params_list(metric: str, n_features: int, seed: int = 1): return [{}] -def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): +def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7): assert_array_equal( ref_indices, indices, @@ -76,10 +77,69 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): ref_dist, dist, err_msg="Query vectors have different neighbors' distances", - rtol=1e-7, + rtol=rtol, ) +def assert_argkmin_results_quasi_equality( + ref_dist, dist, ref_indices, indices, rtol=1e-4 +): + + ref_dist, dist, ref_indices, indices = map( + np.ndarray.flatten, [ref_dist, dist, ref_indices, indices] + ) + + assert ( + len(ref_dist) == len(dist) == len(ref_indices) == len(indices) + ), "Arrays of results have various length." + + n = len(ref_dist) + + skip_permutation_check = False + + for i in range(n - 1): + # We test the equality of pair of adjacent indices and distances + # of the references against the results. 
+ rd_prev, rd_current, rd_next = ref_dist[i - 1], ref_dist[i], ref_dist[i + 1] + d_prev, d_current, d_next = dist[i - 1], dist[i], dist[i + 1] + ri_prev, ri_current, ri_next = ( + ref_indices[i - 1], + ref_indices[i], + ref_indices[i + 1], + ) + i_prev, i_current, i_next = indices[i - 1], indices[i], indices[i + 1] + + assert np.isclose( + d_current, rd_current, rtol=rtol + ), "Query vectors have different neighbors' distances" + + if ri_current != i_current: + # If the current reference index and index are different, + # it might be that they were permuted because their distances + # are relatively close to each other. + # In this case, we need to check for a valid permutation. + valid_permutation = ( + np.isclose(d_current, d_next, rtol=rtol) + and i_next == ri_current + and ri_next == i_current + ) + assert skip_permutation_check or valid_permutation, ( + "Query vectors have different neighbors' indices \n" + f"(i_prev, i_current, i_next) = {i_prev, i_current, i_next} \n" + f"(ri_prev, ri_current, ri_next) = {ri_prev, ri_current, ri_next} \n" + f"(d_prev, d_current, d_next) = {d_prev, d_current, d_next} \n" + f"(rd_prev, rd_current, rd_next) = {rd_prev, rd_current, rd_next} \n" + ) + # If there's a permutation at this iteration, we need to + # skip the following permutation check. + skip_permutation_check = True + continue + + # We need to check for potential permutations for the next iterations. + if skip_permutation_check: + skip_permutation_check = False + + def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): # We get arrays of arrays and we need to check for individual pairs for i in range(ref_dist.shape[0]): @@ -97,8 +157,20 @@ def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, ind ASSERT_RESULT = { - PairwiseDistancesArgKmin: assert_argkmin_results_equality, - PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality, + # In the case of 64bit, we test for exact equality.
+ (PairwiseDistancesArgKmin, np.float64): assert_argkmin_results_equality, + ( + PairwiseDistancesRadiusNeighborhood, + np.float64, + ): assert_radius_neighborhood_results_equality, + # In the case of 32bit, indices can be permuted due to small difference + # in the computations of their associated distances, hence we test equality of + # results up to valid permutations. + (PairwiseDistancesArgKmin, np.float32): assert_argkmin_results_quasi_equality, + ( + PairwiseDistancesRadiusNeighborhood, + np.float32, + ): assert_radius_neighborhood_results_equality, } @@ -107,13 +179,18 @@ def test_pairwise_distances_reduction_is_usable_for(): X = rng.rand(100, 10) Y = rng.rand(100, 10) metric = "euclidean" - assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + + assert PairwiseDistancesReduction.is_usable_for( + X.astype(np.float64), X.astype(np.float64), metric + ) + assert PairwiseDistancesReduction.is_usable_for( + X.astype(np.float32), X.astype(np.float32), metric + ) assert not PairwiseDistancesReduction.is_usable_for( X.astype(np.int64), Y.astype(np.int64), metric ) assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") - # TODO: remove once 32 bits datasets are supported assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) assert not PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric) @@ -130,8 +207,8 @@ def test_argkmin_factory_method_wrong_usages(): metric = "euclidean" msg = ( - "Only 64bit float datasets are supported at this time, " - "got: X.dtype=float32 and Y.dtype=float64" + "Datasets must both be of np.float64 or np.float32 dtype. Currently: " + "X.dtype=float32 and Y.dtype=float64." 
) with pytest.raises(ValueError, match=msg): PairwiseDistancesArgKmin.compute( @@ -139,8 +216,8 @@ def test_argkmin_factory_method_wrong_usages(): ) msg = ( - "Only 64bit float datasets are supported at this time, " - "got: X.dtype=float64 and Y.dtype=int32" + "Datasets must both be of np.float64 or np.float32 dtype. Currently: " + "X.dtype=float64 and Y.dtype=int32" ) with pytest.raises(ValueError, match=msg): PairwiseDistancesArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) @@ -171,7 +248,7 @@ def test_argkmin_factory_method_wrong_usages(): message = ( r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this" r" case \(" - r"FastEuclideanPairwiseDistancesArgKmin\) and will be ignored." + r"FastEuclideanPairwiseDistancesArgKmin." ) with pytest.warns(UserWarning, match=message): @@ -190,8 +267,8 @@ def test_radius_neighborhood_factory_method_wrong_usages(): with pytest.raises( ValueError, match=( - "Only 64bit float datasets are supported at this time, " - "got: X.dtype=float32 and Y.dtype=float64" + "Datasets must both be of np.float64 or np.float32 dtype. " + "Currently: X.dtype=float32 and Y.dtype=float64" ), ): PairwiseDistancesRadiusNeighborhood.compute( @@ -201,8 +278,8 @@ def test_radius_neighborhood_factory_method_wrong_usages(): with pytest.raises( ValueError, match=( - "Only 64bit float datasets are supported at this time, " - "got: X.dtype=float64 and Y.dtype=int32" + "Datasets must both be of np.float64 or np.float32 dtype. " + "Currently: X.dtype=float64 and Y.dtype=int32" ), ): PairwiseDistancesRadiusNeighborhood.compute( @@ -233,8 +310,7 @@ def test_radius_neighborhood_factory_method_wrong_usages(): message = ( r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this" - r" case \(FastEuclideanPairwiseDistancesRadiusNeighborhood\) and will be" - r" ignored." 
+ r" case \(FastEuclideanPairwiseDistancesRadiusNeighborhood" ) with pytest.warns(UserWarning, match=message): @@ -245,6 +321,7 @@ def test_radius_neighborhood_factory_method_wrong_usages(): @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes()) @pytest.mark.parametrize( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], @@ -254,8 +331,8 @@ def test_chunk_size_agnosticism( PairwiseDistancesReduction, n_samples, chunk_size, + dtype, n_features=100, - dtype=np.float64, ): # Results should not depend on the chunk size rng = np.random.RandomState(global_random_seed) @@ -274,6 +351,7 @@ def test_chunk_size_agnosticism( X, Y, parameter, + metric="manhattan", return_distance=True, ) @@ -282,25 +360,27 @@ def test_chunk_size_agnosticism( Y, parameter, chunk_size=chunk_size, + metric="manhattan", return_distance=True, ) - ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( + ref_dist, dist, ref_indices, indices + ) @pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes()) @pytest.mark.parametrize( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], ) def test_n_threads_agnosticism( global_random_seed, + dtype, PairwiseDistancesReduction, n_samples, - chunk_size, n_features=100, - dtype=np.float64, ): # Results should not depend on the number of threads rng = np.random.RandomState(global_random_seed) @@ -327,13 +407,62 @@ def test_n_threads_agnosticism( X, Y, parameter, return_distance=True ) - ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( + ref_dist, dist, ref_indices, 
indices + ) + + +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin], +) +def test_dtype_agnosticism( + PairwiseDistancesReduction, + seed, + metric, + n_samples=1000, + n_features=100, +): + rng = np.random.RandomState(seed) + spread = 100 + X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread + Y64 = rng.rand(n_samples, n_features).astype(np.float64) * spread + X32 = X64.astype(np.float32) + Y32 = Y64.astype(np.float32) + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius slightly with the number of dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.compute( + X64, + Y64, + parameter, + return_distance=True, + ) + + dist, indices = PairwiseDistancesReduction.compute( + X32, Y32, parameter, return_distance=True + ) + + # We check results against np.float32 because we inherently + # lose the information from np.float64.
+ dist = dist.astype(ref_dist.dtype) + ASSERT_RESULT[(PairwiseDistancesReduction, np.float32)]( + ref_dist, dist, ref_indices, indices + ) # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes()) @pytest.mark.parametrize( "PairwiseDistancesReduction", [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], @@ -343,8 +472,8 @@ def test_strategies_consistency( PairwiseDistancesReduction, metric, n_samples, + dtype, n_features=10, - dtype=np.float64, ): rng = np.random.RandomState(global_random_seed) @@ -394,7 +523,7 @@ def test_strategies_consistency( return_distance=True, ) - ASSERT_RESULT[PairwiseDistancesReduction]( + ASSERT_RESULT[(PairwiseDistancesReduction, dtype)]( dist_par_X, dist_par_Y, indices_par_X, @@ -409,6 +538,7 @@ def test_strategies_consistency( @pytest.mark.parametrize("n_features", [50, 500]) @pytest.mark.parametrize("translation", [0, 1e6]) @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("dtype", PairwiseDistancesReduction.valid_dtypes()) @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) def test_pairwise_distances_argkmin( global_random_seed, @@ -416,9 +546,9 @@ def test_pairwise_distances_argkmin( translation, metric, strategy, + dtype, n_samples=100, k=10, - dtype=np.float64, ): rng = np.random.RandomState(global_random_seed) spread = 1000 @@ -439,7 +569,7 @@ def test_pairwise_distances_argkmin( else: dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) # Taking argkmin (indices of the k smallest values) - argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] + argkmin_indices_ref = np.argsort(dist_matrix, kind="mergesort", axis=1)[:, :k] # Getting 
the associated distances argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64) for row_idx in range(argkmin_indices_ref.shape[0]): @@ -459,8 +589,11 @@ def test_pairwise_distances_argkmin( strategy=strategy, ) - ASSERT_RESULT[PairwiseDistancesArgKmin]( - argkmin_distances, argkmin_distances_ref, argkmin_indices, argkmin_indices_ref + ASSERT_RESULT[(PairwiseDistancesArgKmin, dtype)]( + argkmin_distances, + argkmin_distances_ref, + argkmin_indices, + argkmin_indices_ref, ) @@ -526,7 +659,7 @@ def test_pairwise_distances_radius_neighbors( sort_results=True, ) - ASSERT_RESULT[PairwiseDistancesRadiusNeighborhood]( + ASSERT_RESULT[(PairwiseDistancesRadiusNeighborhood, dtype)]( neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref ) @@ -539,13 +672,15 @@ def test_sqeuclidean_row_norms( n_samples, n_features, num_threads, - dtype=np.float64, ): rng = np.random.RandomState(global_random_seed) spread = 100 - X = rng.rand(n_samples, n_features).astype(dtype) * spread + X64 = rng.rand(n_samples, n_features).astype(np.float64) * spread + X32 = X64.astype(np.float32) - sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 - sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + sq_row_norm_reference = np.linalg.norm(X64, axis=1) ** 2 + sq_row_norm64 = np.asarray(_sqeuclidean_row_norms64(X64, num_threads=num_threads)) + sq_row_norm32 = np.asarray(_sqeuclidean_row_norms32(X32, num_threads=num_threads)) - assert_allclose(sq_row_norm_reference, sq_row_norm) + assert_allclose(sq_row_norm_reference, sq_row_norm64, rtol=1e-7) + assert_allclose(sq_row_norm_reference, sq_row_norm32, rtol=1e-6)