diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 43aeeb9eb61b9..03c4d187fd9c2 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -188,7 +188,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
         ITYPE_t X_end,
     ) nogil:
         cdef:
-            ITYPE_t idx, jdx
+            ITYPE_t idx
 
         # Sorting the main heaps portion associated to `X[X_start:X_end]`
         # in ascending order w.r.t the distances.
@@ -287,7 +287,6 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
     cdef void compute_exact_distances(self) nogil:
         cdef:
             ITYPE_t i, j
-            ITYPE_t[:, ::1] Y_indices = self.argkmin_indices
             DTYPE_t[:, ::1] distances = self.argkmin_distances
         for i in prange(self.n_samples_X, schedule='static', nogil=True,
                         num_threads=self.effective_n_threads):
@@ -432,7 +431,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
     cdef void _parallel_on_Y_init(
         self,
     ) nogil:
-        cdef ITYPE_t thread_num
         ArgKmin{{name_suffix}}._parallel_on_Y_init(self)
 
         self.gemm_term_computer._parallel_on_Y_init()
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index de581fe662b57..d3303511b64d9 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -16,7 +16,7 @@ from numbers import Integral
 from sklearn import get_config
 from sklearn.utils import check_scalar
 from ...utils._openmp_helpers import _openmp_effective_n_threads
-from ...utils._typedefs import ITYPE, DTYPE
+from ...utils._typedefs import DTYPE
 
 cnp.import_array()
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 809d683b52ced..d028d7e0b5189 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -61,12 +61,15 @@ class BaseDistanceReductionDispatcher:
     @classmethod
     def valid_metrics(cls) -> List[str]:
         excluded = {
-            "pyfunc",  # is relatively slow because we need to coerce data as np arrays
+            # PyFunc cannot be supported because it necessitates interacting with
+            # the CPython interpreter to call user-defined functions.
+            "pyfunc",
             "mahalanobis",  # is numerically unstable
-            # TODO: In order to support discrete distance metrics, we need to have a
-            # stable simultaneous sort which preserves the order of the input.
-            # The best might be using std::stable_sort and a Comparator taking an
-            # Arrays of Structures instead of Structure of Arrays (currently used).
+            # In order to support discrete distance metrics, we need to have a
+            # stable simultaneous sort which preserves the order of the indices
+            # because there are generally many occurrences of a given distance
+            # value in this case.
+            # TODO: implement a stable simultaneous_sort.
             "hamming",
             *BOOL_METRICS,
         }
@@ -241,8 +244,6 @@ def compute(
                 'parallel_on_X' is usually the most efficient strategy.
                 When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
                 brings more opportunity for parallelism and is therefore more efficient
-                despite the synchronization step at each iteration of the outer loop
-                on chunks of `X`.
 
               - None (default) looks-up in scikit-learn configuration for
                 `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
@@ -265,20 +266,14 @@ def compute(
 
         Notes
         -----
-        This classmethod is responsible for introspecting the arguments
-        values to dispatch to the most appropriate implementation of
-        :class:`ArgKmin64`.
+        This classmethod inspects the argument values to dispatch to the
+        dtype-specialized implementation of :class:`ArgKmin`.
 
         This allows decoupling the API entirely from the implementation details
         whilst maintaining RAII: all temporarily allocated datastructures necessary
         for the concrete implementation are therefore freed when this classmethod
         returns.
         """
-        # Note (jjerphan): Some design thoughts for future extensions.
-        # This factory comes to handle specialisations for the given arguments.
-        # For future work, this might can be an entrypoint to specialise operations
-        # for various backend and/or hardware and/or datatypes, and/or fused
-        # {sparse, dense}-datasetspair etc.
         if X.dtype == Y.dtype == np.float64:
             return ArgKmin64.compute(
                 X=X,
@@ -415,21 +410,14 @@ def compute(
 
         Notes
         -----
-        This public classmethod is responsible for introspecting the arguments
-        values to dispatch to the private dtype-specialized implementation of
-        :class:`RadiusNeighbors64`.
+        This classmethod inspects the argument values to dispatch to the
+        dtype-specialized implementation of :class:`RadiusNeighbors`.
 
-        All temporarily allocated datastructures necessary for the concrete
-        implementation are therefore freed when this classmethod returns.
-
-        This allows entirely decoupling the API entirely from the
-        implementation details whilst maintaining RAII.
+        This allows decoupling the API entirely from the implementation details
+        whilst maintaining RAII: all temporarily allocated datastructures necessary
+        for the concrete implementation are therefore freed when this classmethod
+        returns.
         """
-        # Note (jjerphan): Some design thoughts for future extensions.
-        # This factory comes to handle specialisations for the given arguments.
-        # For future work, this might can be an entrypoint to specialise operations
-        # for various backend and/or hardware and/or datatypes, and/or fused
-        # {sparse, dense}-datasetspair etc.
         if X.dtype == Y.dtype == np.float64:
             return RadiusNeighbors64.compute(
                 X=X,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
index cb44012a1c4b0..2646d7b7f2e53 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp
@@ -22,7 +22,6 @@ from ...utils._typedefs cimport DTYPE_t, ITYPE_t
 from ...utils._cython_blas cimport (
     BLAS_Order,
     BLAS_Trans,
-    ColMajor,
     NoTrans,
     RowMajor,
     Trans,
@@ -176,8 +175,6 @@ cdef class GEMMTermComputer{{name_suffix}}:
         ITYPE_t thread_num,
     ) nogil:
         cdef:
-            ITYPE_t i, j
-            DTYPE_t squared_dist_i_j
            const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :]
            const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data()
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
index 6bcf072711abf..e9a16de288299 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp
@@ -218,7 +218,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
         ITYPE_t X_end,
     ) nogil:
         cdef:
-            ITYPE_t idx, jdx
+            ITYPE_t idx
 
         # Sorting neighbors for each query vector of X
         if self.sort_results:
@@ -279,7 +279,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}):
         self,
     ) nogil:
         cdef:
-            ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current
+            ITYPE_t idx
 
         with nogil, parallel(num_threads=self.effective_n_threads):
             # Merge vectors used in threads into the main ones.
@@ -439,7 +439,6 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
     cdef void _parallel_on_Y_init(
         self,
     ) nogil:
-        cdef ITYPE_t thread_num
        RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self)

        self.gemm_term_computer._parallel_on_Y_init()
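
Note on the dispatch pattern referenced in the docstring changes above: the public `compute` classmethod only inspects the dtypes of `X` and `Y` and delegates to a dtype-specialized implementation (e.g. `ArgKmin64` for float64 inputs). The following is a minimal, self-contained Python sketch of that pattern; the brute-force NumPy stand-ins, the `_ArgKmin32` name, and the error message are illustrative assumptions, not the actual Cython back-ends.

# Minimal sketch of dtype-based dispatch (illustrative only).
import numpy as np


def _argkmin_numpy(X, Y, k):
    # Brute-force stand-in for the Cython back-end: squared Euclidean
    # distances, then keep and order the k smallest per query row.
    sq_dists = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(axis=-1)
    indices = np.argpartition(sq_dists, kth=k - 1, axis=1)[:, :k]
    order = np.take_along_axis(sq_dists, indices, axis=1).argsort(axis=1)
    return np.take_along_axis(indices, order, axis=1)


class _ArgKmin64:
    @classmethod
    def compute(cls, X, Y, k):
        return _argkmin_numpy(X, Y, k)


class _ArgKmin32:
    @classmethod
    def compute(cls, X, Y, k):
        return _argkmin_numpy(X, Y, k)


class ArgKmin:
    """Public dispatcher: inspect argument dtypes, delegate to a specialization."""

    @classmethod
    def compute(cls, X, Y, k):
        if X.dtype == Y.dtype == np.float64:
            return _ArgKmin64.compute(X=X, Y=Y, k=k)
        if X.dtype == Y.dtype == np.float32:
            return _ArgKmin32.compute(X=X, Y=Y, k=k)
        raise ValueError(
            f"Unsupported dtype pair: X.dtype={X.dtype}, Y.dtype={Y.dtype}."
        )


rng = np.random.default_rng(0)
X = rng.standard_normal((5, 3))
Y = rng.standard_normal((8, 3))
print(ArgKmin.compute(X, Y, k=2))  # indices of the 2 nearest rows of Y per row of X

Because every intermediate buffer lives inside the specialized `compute` call, it is released when the classmethod returns, which is the RAII property the updated docstrings describe.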