From 792dafcbe993e276df301900bc82e17b2e38b445 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 10:07:27 +0200 Subject: [PATCH 1/8] Remove out-of-date comment --- .../_pairwise_distances_reduction/_dispatcher.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 809d683b52ced..a690919a5fc3e 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -274,11 +274,6 @@ def compute( for the concrete implementation are therefore freed when this classmethod returns. """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. if X.dtype == Y.dtype == np.float64: return ArgKmin64.compute( X=X, @@ -425,11 +420,6 @@ def compute( This allows entirely decoupling the API entirely from the implementation details whilst maintaining RAII. """ - # Note (jjerphan): Some design thoughts for future extensions. - # This factory comes to handle specialisations for the given arguments. - # For future work, this might can be an entrypoint to specialise operations - # for various backend and/or hardware and/or datatypes, and/or fused - # {sparse, dense}-datasetspair etc. 
if X.dtype == Y.dtype == np.float64: return RadiusNeighbors64.compute( X=X, From b7fbde0c0af4a6d5b79d8f64ba45a99a8c15c5c5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 14:26:10 +0200 Subject: [PATCH 2/8] Reword explanations for unsupported distance metrics --- .../_pairwise_distances_reduction/_dispatcher.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index a690919a5fc3e..b66be17f16bc9 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -61,12 +61,15 @@ class BaseDistanceReductionDispatcher: @classmethod def valid_metrics(cls) -> List[str]: excluded = { - "pyfunc", # is relatively slow because we need to coerce data as np arrays + # PyFunc cannot be supported because it necessitates interacting with + # the CPython interpreter to call user-defined functions. + "pyfunc", "mahalanobis", # is numerically unstable - # TODO: In order to support discrete distance metrics, we need to have a - # stable simultaneous sort which preserves the order of the input. - # The best might be using std::stable_sort and a Comparator taking an - # Arrays of Structures instead of Structure of Arrays (currently used). + # In order to support discrete distance metrics, we need to have a + # stable simultaneous sort which preserves the order of the indices + # because there generally are many occurrences of a given distance + # value in this case. + # TODO: implement a stable simultaneous_sort. 
"hamming", *BOOL_METRICS, } From 33ad025fdfaead4bf4fd815e2428d9f5dc61b813 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 14:44:15 +0200 Subject: [PATCH 3/8] Remove unused symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you, @MarcoGorelli for working on cython-lint! 🙏 --- .../metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 6 ++---- .../metrics/_pairwise_distances_reduction/_base.pyx.tp | 2 +- .../_gemm_term_computer.pyx.tp | 3 --- .../_radius_neighborhood.pyx.tp | 8 ++------ 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 1c1459e27f210..39f1f2bb81099 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -30,7 +30,7 @@ import warnings from numbers import Integral from scipy.sparse import issparse -from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration +from ...utils import check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE @@ -204,7 +204,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): ITYPE_t X_end, ) nogil: cdef: - ITYPE_t idx, jdx + ITYPE_t idx # Sorting the main heaps portion associated to `X[X_start:X_end]` # in ascending order w.r.t the distances. 
@@ -303,7 +303,6 @@ cdef class ArgKmin{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): cdef void compute_exact_distances(self) nogil: cdef: ITYPE_t i, j - ITYPE_t[:, ::1] Y_indices = self.argkmin_indices DTYPE_t[:, ::1] distances = self.argkmin_distances for i in prange(self.n_samples_X, schedule='static', nogil=True, num_threads=self.effective_n_threads): @@ -448,7 +447,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): cdef void _parallel_on_Y_init( self, ) nogil: - cdef ITYPE_t thread_num ArgKmin{{name_suffix}}._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index d0c06de0f8761..cea680973267c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -31,7 +31,7 @@ from numbers import Integral from sklearn import get_config from sklearn.utils import check_scalar from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils._typedefs import ITYPE, DTYPE +from ...utils._typedefs import DTYPE cnp.import_array() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp index 35e57219a96a7..d415b465d57cf 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_gemm_term_computer.pyx.tp @@ -22,7 +22,6 @@ from ...utils._typedefs cimport DTYPE_t, ITYPE_t from ...utils._cython_blas cimport ( BLAS_Order, BLAS_Trans, - ColMajor, NoTrans, RowMajor, Trans, @@ -176,8 +175,6 @@ cdef class GEMMTermComputer{{name_suffix}}: ITYPE_t thread_num, ) nogil: cdef: - ITYPE_t i, j - DTYPE_t squared_dist_i_j const {{INPUT_DTYPE_t}}[:, ::1] X_c = self.X[X_start:X_end, :] const {{INPUT_DTYPE_t}}[:, ::1] Y_c = self.Y[Y_start:Y_end, 
:] DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 8d50a6e04abf9..7bbc53c8e5582 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -29,7 +29,7 @@ from ...utils._vector_sentinel cimport vector_to_nd_array from numbers import Real from scipy.sparse import issparse -from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration +from ...utils import check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits cnp.import_array() @@ -232,9 +232,6 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): ITYPE_t X_start, ITYPE_t X_end, ) nogil: - cdef: - ITYPE_t idx, jdx - # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): @@ -294,7 +291,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): self, ) nogil: cdef: - ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + ITYPE_t idx with nogil, parallel(num_threads=self.effective_n_threads): # Merge vectors used in threads into the main ones. 
@@ -454,7 +451,6 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix} cdef void _parallel_on_Y_init( self, ) nogil: - cdef ITYPE_t thread_num RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self) self.gemm_term_computer._parallel_on_Y_init() From 15c2d5b89969f7cc3f48369b385551eb5f069ec2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 29 Sep 2022 15:02:22 +0200 Subject: [PATCH 4/8] DOC Make dispatchers' docstring uniform --- .../_dispatcher.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index b66be17f16bc9..e3398dc1d28f2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -244,8 +244,6 @@ def compute( 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' brings more opportunity for parallelism and is therefore more efficient - despite the synchronization step at each iteration of the outer loop - on chunks of `X`. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. @@ -268,9 +266,9 @@ def compute( Notes ----- - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKmin64`. + This public classmethod is responsible for introspecting the arguments + values to dispatch to the private dtype-specialized implementation of + :class:`ArgKmin`. 
This allows decoupling the API entirely from the implementation details whilst maintaining RAII: all temporarily allocated datastructures necessary @@ -415,13 +413,12 @@ def compute( ----- This public classmethod is responsible for introspecting the arguments values to dispatch to the private dtype-specialized implementation of - :class:`RadiusNeighbors64`. - - All temporarily allocated datastructures necessary for the concrete - implementation are therefore freed when this classmethod returns. + :class:`RadiusNeighbors`. - This allows entirely decoupling the API entirely from the - implementation details whilst maintaining RAII. + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. """ if X.dtype == Y.dtype == np.float64: return RadiusNeighbors64.compute( From b86bef97d7124ece82a7097bd8f5783f720acb16 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 10 Oct 2022 17:24:24 +0200 Subject: [PATCH 5/8] Import check_array --- sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp | 2 +- .../_pairwise_distances_reduction/_radius_neighborhood.pyx.tp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 39f1f2bb81099..33e9d7256f955 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -30,7 +30,7 @@ import warnings from numbers import Integral from scipy.sparse import issparse -from ...utils import check_scalar, _in_unstable_openblas_configuration +from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits from ...utils._typedefs import ITYPE, DTYPE diff --git 
a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index 7bbc53c8e5582..b59e006607238 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -29,7 +29,7 @@ from ...utils._vector_sentinel cimport vector_to_nd_array from numbers import Real from scipy.sparse import issparse -from ...utils import check_scalar, _in_unstable_openblas_configuration +from ...utils import check_array, check_scalar, _in_unstable_openblas_configuration from ...utils.fixes import threadpool_limits cnp.import_array() From f665c0262144fcb1f336099c72c2870c9c577e24 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 11 Oct 2022 17:24:18 +0200 Subject: [PATCH 6/8] Explicitly type idx This was removed inadvertently. Co-authored-by: Thomas J. Fan --- .../_pairwise_distances_reduction/_radius_neighborhood.pyx.tp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp index b59e006607238..423a03a2ae279 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighborhood.pyx.tp @@ -232,6 +232,9 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistanceReducer{{name_suffix}}): ITYPE_t X_start, ITYPE_t X_end, ) nogil: + cdef: + ITYPE_t idx + # Sorting neighbors for each query vector of X if self.sort_results: for idx in range(X_start, X_end): From 12ccb83f603510d5ebf036f31900dedebe1a01e9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 13 Oct 2022 09:19:51 +0200 Subject: [PATCH 7/8] DOC Do not document dispatchers' methods as public Co-authored-by: Olivier Grisel --- .../_pairwise_distances_reduction/_dispatcher.py | 10 ++++------ 1 file 
changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index e3398dc1d28f2..d028d7e0b5189 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -266,9 +266,8 @@ def compute( Notes ----- - This public classmethod is responsible for introspecting the arguments - values to dispatch to the private dtype-specialized implementation of - :class:`ArgKmin`. + This classmethod inspects the argument values to dispatch to the + dtype-specialized implementation of :class:`ArgKmin`. This allows decoupling the API entirely from the implementation details whilst maintaining RAII: all temporarily allocated datastructures necessary @@ -411,9 +410,8 @@ def compute( Notes ----- - This public classmethod is responsible for introspecting the arguments - values to dispatch to the private dtype-specialized implementation of - :class:`RadiusNeighbors`. + This classmethod inspects the argument values to dispatch to the + dtype-specialized implementation of :class:`RadiusNeighbors`. This allows decoupling the API entirely from the implementation details whilst maintaining RAII: all temporarily allocated datastructures necessary From 66dde63700951259f3f29f5c4b3153d403210f29 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 13 Oct 2022 09:44:18 +0200 Subject: [PATCH 8/8] CI Trigger CI due to a faulty run