From 77c284cfdd6c4e102b2083161e3ce8ab2a3dd046 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Dec 2021 15:38:45 +0100 Subject: [PATCH 01/13] Refactor fixtures prior to PairwiseDistancesReduction introduction --- sklearn/neighbors/_binary_tree.pxi | 161 +++------------------- sklearn/neighbors/tests/test_ball_tree.py | 10 ++ sklearn/utils/_heap.pxd | 19 +++ sklearn/utils/_heap.pyx | 138 +++++++++++++++++++ sklearn/utils/_openmp_helpers.pxd | 6 + sklearn/utils/_openmp_helpers.pyx | 15 +- sklearn/utils/setup.py | 7 + 7 files changed, 208 insertions(+), 148 deletions(-) create mode 100644 sklearn/utils/_heap.pxd create mode 100644 sklearn/utils/_heap.pyx create mode 100644 sklearn/utils/_openmp_helpers.pxd diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 6542bc680c58c..2dec9e8148ef5 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -106,7 +106,7 @@ # cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, ITYPE_t n_features): # """Allocate arrays needed for the KD Tree""" -# cdef int init_node(BinaryTree tree, ITYPE_t i_node, +# cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, # ITYPE_t idx_start, ITYPE_t idx_end): # """Initialize the node for the dataset stored in tree.data""" @@ -142,7 +142,6 @@ # BinaryTree tree2, ITYPE_t i_node2): # """Compute the maximum distance between two nodes""" -cimport cython cimport numpy as np from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax @@ -151,21 +150,21 @@ from libc.string cimport memcpy import numpy as np import warnings -from ..utils import check_array - -from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from sklearn.utils._typedefs import DTYPE, ITYPE from ..metrics._dist_metrics cimport ( DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, - euclidean_rdist_to_dist, ) from ._partition_nodes cimport partition_node_indices +from ..utils import check_array +from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs import DTYPE, ITYPE +from ..utils._heap cimport simultaneous_sort as _simultaneous_sort, heap_push + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -231,7 +230,7 @@ leaf_size : positive int, default=40 the case that ``n_samples < leaf_size``. metric : str or DistanceMetric object - the distance metric to use for the tree. Default='minkowski' + The distance metric to use for the tree. Default='minkowski' with p=2 (that is, a euclidean metric). See the documentation of the DistanceMetric class for a list of available metrics. 
{binary_tree}.valid_metrics gives a list of the metrics which @@ -494,27 +493,6 @@ def kernel_norm(h, d, kernel, return_log=False): return np.exp(result) -###################################################################### -# Tree Utility Routines -cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2): - """swap the values at index i1 and i2 of arr""" - cdef DITYPE_t tmp = arr[i1] - arr[i1] = arr[i2] - arr[i2] = tmp - - -cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """swap the values at inex i1 and i2 of both darr and iarr""" - cdef DTYPE_t dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp - - cdef class NeighborsHeap: """A max-heap structure to keep track of distances/indices of neighbors @@ -569,52 +547,11 @@ cdef class NeighborsHeap: cdef int _push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val) nogil except -1: """push (val, i_val) into the given row""" - cdef ITYPE_t i, ic1, ic2, i_swap - cdef ITYPE_t size = self.distances.shape[1] - cdef DTYPE_t* dist_arr = &self.distances[row, 0] - cdef ITYPE_t* ind_arr = &self.indices[row, 0] - - # check if val should be in heap - if val >= dist_arr[0]: - return 0 - - # insert val at position zero - dist_arr[0] = val - ind_arr[0] = i_val - - # descend the heap, swapping values until the max heap criterion is met - i = 0 - while True: - ic1 = 2 * i + 1 - ic2 = ic1 + 1 - - if ic1 >= size: - break - elif ic2 >= size: - if dist_arr[ic1] > val: - i_swap = ic1 - else: - break - elif dist_arr[ic1] >= dist_arr[ic2]: - if val < dist_arr[ic1]: - i_swap = ic1 - else: - break - else: - if val < dist_arr[ic2]: - i_swap = ic2 - else: - break - - dist_arr[i] = dist_arr[i_swap] - ind_arr[i] = ind_arr[i_swap] - - i = i_swap - - dist_arr[i] = val - ind_arr[i] = i_val - - return 0 + cdef: + ITYPE_t size = self.distances.shape[1] + DTYPE_t* dist_arr = &self.distances[row, 0] + ITYPE_t* ind_arr = &self.indices[row, 0] + return heap_push(dist_arr, ind_arr, size, val, i_val) cdef int _sort(self) except -1: """simultaneously sort the distances and indices""" @@ -627,68 +564,6 @@ cdef class NeighborsHeap: distances.shape[1]) return 0 - -cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, - ITYPE_t size) nogil except -1: - """ - Perform a recursive quicksort on the dist array, simultaneously - performing the same swaps on the idx array. The equivalent in - numpy (though quite a bit slower) is - - def simultaneous_sort(dist, idx): - i = np.argsort(dist) - return dist[i], idx[i] - """ - cdef ITYPE_t pivot_idx, i, store_idx - cdef DTYPE_t pivot_val - - # in the small-array case, do things efficiently - if size <= 1: - pass - elif size == 2: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - elif size == 3: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - if dist[1] > dist[2]: - dual_swap(dist, idx, 1, 2) - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - else: - # Determine the pivot using the median-of-three rule. - # The smallest of the three is moved to the beginning of the array, - # the middle (the pivot value) is moved to the end, and the largest - # is moved to the pivot index. - pivot_idx = size / 2 - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - if dist[size - 1] > dist[pivot_idx]: - dual_swap(dist, idx, size - 1, pivot_idx) - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - pivot_val = dist[size - 1] - - # partition indices about pivot. 
At the end of this operation, - # pivot_idx will contain the pivot value, everything to the left - # will be smaller, and everything to the right will be larger. - store_idx = 0 - for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) - store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) - pivot_idx = store_idx - - # recursively sort each side of the pivot - if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) - if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, - size - pivot_idx - 1) - return 0 - #------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of @@ -883,7 +758,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from sklearn.metrics._dist_metrics import get_valid_metric_ids +from ..metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) @@ -900,13 +775,9 @@ cdef class BinaryTree: cdef readonly const DTYPE_t[:, ::1] data cdef readonly const DTYPE_t[::1] sample_weight cdef public DTYPE_t sum_weight - - # Even if those memoryviews attributes are const-qualified, - # they get modified via their numpy counterpart. - # For instance, `node_data` gets modified via `node_data_arr`. - cdef public const ITYPE_t[::1] idx_array - cdef public const NodeData_t[::1] node_data - cdef public const DTYPE_t[:, :, ::1] node_bounds + cdef public ITYPE_t[::1] idx_array + cdef public NodeData_t[::1] node_data + cdef public DTYPE_t[:, :, ::1] node_bounds cdef ITYPE_t leaf_size cdef ITYPE_t n_levels diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 41ccff25a260e..a823a03251a1b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -85,3 +85,13 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) + + +def test_bad_pyfunc_metric(): + def wrong_distance(x, y): + return "1" + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd new file mode 100644 index 0000000000000..0b65a5a32e393 --- /dev/null +++ b/sklearn/utils/_heap.pxd @@ -0,0 +1,19 @@ +# Heap routines, used in various Cython implementation. 
+ +from cython cimport floating + +from ._typedefs cimport ITYPE_t + +cdef int simultaneous_sort( + floating* dist, + ITYPE_t* idx, + ITYPE_t size +) nogil + +cdef int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx new file mode 100644 index 0000000000000..89fc779877c4f --- /dev/null +++ b/sklearn/utils/_heap.pyx @@ -0,0 +1,138 @@ +from cython cimport floating, integral, numeric + +from ._typedefs cimport ITYPE_t + +cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, + ITYPE_t a, ITYPE_t b) nogil: + """Swap the values at index i1 and i2 of both darr and iarr""" + cdef floating dtmp = darr[a] + darr[a] = darr[b] + darr[b] = dtmp + + cdef ITYPE_t itmp = iarr[a] + iarr[a] = iarr[b] + iarr[b] = itmp + +cdef int simultaneous_sort( + floating* values, + ITYPE_t* indices, + ITYPE_t size +) nogil: + """ + Perform a recursive quicksort on the values array, simultaneously + performing the same swaps on the indices array. + """ + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using a std::stable_sort and a Comparator which might need + # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) + # currently used. + cdef: + ITYPE_t pivot_idx, i, store_idx + floating pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + elif size == 3: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + if values[1] > values[2]: + dual_swap(values, indices, 1, 2) + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size // 2 + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + if values[size - 1] > values[pivot_idx]: + dual_swap(values, indices, size - 1, pivot_idx) + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + pivot_val = values[size - 1] + + # partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if values[i] < pivot_val: + dual_swap(values, indices, i, store_idx) + store_idx += 1 + dual_swap(values, indices, store_idx, size - 1) + pivot_idx = store_idx + + # recursively sort each side of the pivot + if pivot_idx > 1: + simultaneous_sort(values, indices, pivot_idx) + if pivot_idx + 2 < size: + simultaneous_sort(values + pivot_idx + 1, + indices + pivot_idx + 1, + size - pivot_idx - 1) + return 0 + + +cdef inline int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil: + """Push a tuple (val, val_idx) into a fixed-size max-heap. + + The max-heap is represented as a struct of arrays where: + - values is the array containing the data to construct the heap on + - indices is the array containing the indices (meta-data) of each value. 
+ """ + cdef: + ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx + + # check if val should be in heap + if val >= values[0]: + return 0 + + # insert val at position zero + values[0] = val + indices[0] = val_idx + + # descend the heap, swapping values until the max heap criterion is met + current_idx = 0 + while True: + left_child_idx = 2 * current_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= size: + break + elif right_child_idx >= size: + if values[left_child_idx] > val: + swap_idx = left_child_idx + else: + break + elif values[left_child_idx] >= values[right_child_idx]: + if val < values[left_child_idx]: + swap_idx = left_child_idx + else: + break + else: + if val < values[right_child_idx]: + swap_idx = right_child_idx + else: + break + + values[current_idx] = values[swap_idx] + indices[current_idx] = indices[swap_idx] + + current_idx = swap_idx + + values[current_idx] = val + indices[current_idx] = val_idx + + return 0 diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd new file mode 100644 index 0000000000000..e57fc9bfa6bf5 --- /dev/null +++ b/sklearn/utils/_openmp_helpers.pxd @@ -0,0 +1,6 @@ +# Helpers to access OpenMP threads information +# +# Those interfaces act as indirections which allows the non-support of OpenMP +# for implementations which have been written for it. + +cdef int _openmp_thread_num() nogil diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx index fb8920074a84e..cddd77ac42746 100644 --- a/sklearn/utils/_openmp_helpers.pyx +++ b/sklearn/utils/_openmp_helpers.pyx @@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: def _openmp_parallelism_enabled(): """Determines whether scikit-learn has been built with OpenMP - + It allows to retrieve at runtime the information gathered at compile time. """ # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during @@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None): - if the ``OMP_NUM_THREADS`` environment variable is set, return ``openmp.omp_get_max_threads()`` - otherwise, return the minimum between ``openmp.omp_get_max_threads()`` - and the number of cpus, taking cgroups quotas into account. Cgroups + and the number of cpus, taking cgroups quotas into account. Cgroups quotas can typically be set by tools such as Docker. The result of ``omp_get_max_threads`` can be influenced by environment variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``. @@ -59,4 +59,13 @@ cpdef _openmp_effective_n_threads(n_threads=None): # OpenMP disabled at build-time => sequential mode return 1 - + +cdef inline int _openmp_thread_num() nogil: + """Return the number of the thread calling this function. + + If scikit-learn is built without OpenMP support, always return 0. + """ + IF SKLEARN_OPENMP_PARALLELISM_ENABLED: + return openmp.omp_get_thread_num() + ELSE: + return 0 diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index ed78ecc5db76f..04d446e9b55aa 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -95,6 +95,13 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_heap", + sources=["_heap.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_subpackage("tests") return config From 15b7d854b13fe9dc313a089474c128607bd3faa3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 15 Dec 2021 15:38:45 +0100 Subject: [PATCH 02/13] fixup! Revert some changes Co-authored-by: Thomas J. 
Fan --- sklearn/neighbors/_binary_tree.pxi | 14 +++++++++----- sklearn/utils/_heap.pyx | 7 +++++++ sklearn/utils/_openmp_helpers.pxd | 6 ------ sklearn/utils/_openmp_helpers.pyx | 15 +++------------ 4 files changed, 19 insertions(+), 23 deletions(-) delete mode 100644 sklearn/utils/_openmp_helpers.pxd diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 2dec9e8148ef5..b3f02cd74d7b5 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -106,7 +106,7 @@ # cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, ITYPE_t n_features): # """Allocate arrays needed for the KD Tree""" -# cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, +# cdef int init_node(BinaryTree tree, ITYPE_t i_node, # ITYPE_t idx_start, ITYPE_t idx_end): # """Initialize the node for the dataset stored in tree.data""" @@ -758,7 +758,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from ..metrics._dist_metrics import get_valid_metric_ids +from sklearn.metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) @@ -775,9 +775,13 @@ cdef class BinaryTree: cdef readonly const DTYPE_t[:, ::1] data cdef readonly const DTYPE_t[::1] sample_weight cdef public DTYPE_t sum_weight - cdef public ITYPE_t[::1] idx_array - cdef public NodeData_t[::1] node_data - cdef public DTYPE_t[:, :, ::1] node_bounds + + # Even if those memoryviews attributes are const-qualified, + # they get modified via their numpy counterpart. + # For instance, `node_data` gets modified via `node_data_arr`. + cdef public const ITYPE_t[::1] idx_array + cdef public const NodeData_t[::1] node_data + cdef public const DTYPE_t[:, :, ::1] node_bounds cdef ITYPE_t leaf_size cdef ITYPE_t n_levels diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 89fc779877c4f..86de96568e0b2 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -21,6 +21,13 @@ cdef int simultaneous_sort( """ Perform a recursive quicksort on the values array, simultaneously performing the same swaps on the indices array. + + The numpy equivalent is: + + def simultaneous_sort(dist, idx): + i = np.argsort(dist) + return dist[i], idx[i] + """ # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd deleted file mode 100644 index e57fc9bfa6bf5..0000000000000 --- a/sklearn/utils/_openmp_helpers.pxd +++ /dev/null @@ -1,6 +0,0 @@ -# Helpers to access OpenMP threads information -# -# Those interfaces act as indirections which allows the non-support of OpenMP -# for implementations which have been written for it. - -cdef int _openmp_thread_num() nogil diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx index cddd77ac42746..fb8920074a84e 100644 --- a/sklearn/utils/_openmp_helpers.pyx +++ b/sklearn/utils/_openmp_helpers.pyx @@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: def _openmp_parallelism_enabled(): """Determines whether scikit-learn has been built with OpenMP - + It allows to retrieve at runtime the information gathered at compile time. 
""" # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during @@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None): - if the ``OMP_NUM_THREADS`` environment variable is set, return ``openmp.omp_get_max_threads()`` - otherwise, return the minimum between ``openmp.omp_get_max_threads()`` - and the number of cpus, taking cgroups quotas into account. Cgroups + and the number of cpus, taking cgroups quotas into account. Cgroups quotas can typically be set by tools such as Docker. The result of ``omp_get_max_threads`` can be influenced by environment variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``. @@ -59,13 +59,4 @@ cpdef _openmp_effective_n_threads(n_threads=None): # OpenMP disabled at build-time => sequential mode return 1 - -cdef inline int _openmp_thread_num() nogil: - """Return the number of the thread calling this function. - - If scikit-learn is built without OpenMP support, always return 0. - """ - IF SKLEARN_OPENMP_PARALLELISM_ENABLED: - return openmp.omp_get_thread_num() - ELSE: - return 0 + From e36900836a1de064a4cf686b2912cc8623481b86 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Dec 2021 08:00:26 +0100 Subject: [PATCH 03/13] DOC Correct and improve comments Co-authored-by: Christian Lorentzen --- sklearn/utils/_heap.pxd | 2 +- sklearn/utils/_heap.pyx | 33 +++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd index 0b65a5a32e393..c760d900a1c3b 100644 --- a/sklearn/utils/_heap.pxd +++ b/sklearn/utils/_heap.pxd @@ -1,4 +1,4 @@ -# Heap routines, used in various Cython implementation. +# Heap routines, used in various Cython implementations. from cython cimport floating diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 86de96568e0b2..8f1368c0878a6 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -4,7 +4,7 @@ from ._typedefs cimport ITYPE_t cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, ITYPE_t a, ITYPE_t b) nogil: - """Swap the values at index i1 and i2 of both darr and iarr""" + """Swap the values at index a and b of both darr and iarr""" cdef floating dtmp = darr[a] darr[a] = darr[b] darr[b] = dtmp @@ -19,8 +19,8 @@ cdef int simultaneous_sort( ITYPE_t size ) nogil: """ - Perform a recursive quicksort on the values array, simultaneously - performing the same swaps on the indices array. + Perform a recursive quicksort on the values array as to sort them ascendingly. + This simultaneously perform the swaps on both the values and the indices arrays. The numpy equivalent is: @@ -28,6 +28,10 @@ cdef int simultaneous_sort( i = np.argsort(dist) return dist[i], idx[i] + Notes + ----- + Arrays are manipulated via a pointer to there first element and their size + as to ease the processing of dynamically allocated buffers. """ # TODO: In order to support discrete distance metrics, we need to have a # simultaneous sort which breaks ties on indices when distances are identical. @@ -65,7 +69,7 @@ cdef int simultaneous_sort( dual_swap(values, indices, 0, size - 1) pivot_val = values[size - 1] - # partition indices about pivot. At the end of this operation, + # Partition indices about pivot. At the end of this operation, # pivot_idx will contain the pivot value, everything to the left # will be smaller, and everything to the right will be larger. 
store_idx = 0 @@ -76,7 +80,7 @@ cdef int simultaneous_sort( dual_swap(values, indices, store_idx, size - 1) pivot_idx = store_idx - # recursively sort each side of the pivot + # Recursively sort each side of the pivot if pivot_idx > 1: simultaneous_sort(values, indices, pivot_idx) if pivot_idx + 2 < size: @@ -93,24 +97,29 @@ cdef inline int heap_push( floating val, ITYPE_t val_idx, ) nogil: - """Push a tuple (val, val_idx) into a fixed-size max-heap. + """Push a tuple (val, val_idx) onto a fixed-size max-heap. - The max-heap is represented as a struct of arrays where: - - values is the array containing the data to construct the heap on - - indices is the array containing the indices (meta-data) of each value. + The max-heap is represented as a Structure of Arrays where: + - values is the array containing the data to construct then heap with + - indices is the array containing the indices (meta-data) of each value + + Notes + ----- + Arrays are manipulated via a pointer to there first element and their size + as to ease the processing of dynamically allocated buffers. """ cdef: ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx - # check if val should be in heap + # Check if val should be in heap if val >= values[0]: return 0 - # insert val at position zero + # Insert val at position zero values[0] = val indices[0] = val_idx - # descend the heap, swapping values until the max heap criterion is met + # Descend the heap, swapping values until the max heap criterion is met current_idx = 0 while True: left_child_idx = 2 * current_idx + 1 From 5429ad74d9191c74c816ce1ae63c35867febf752 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 16 Dec 2021 20:24:19 +0100 Subject: [PATCH 04/13] DOC Given a simple example to explain heap_push Also fix typo. Co-authored-by: Olivier Grisel --- sklearn/utils/_heap.pyx | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index 8f1368c0878a6..d56740676966d 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -100,13 +100,29 @@ cdef inline int heap_push( """Push a tuple (val, val_idx) onto a fixed-size max-heap. The max-heap is represented as a Structure of Arrays where: - - values is the array containing the data to construct then heap with + - values is the array containing the data to construct the heap with - indices is the array containing the indices (meta-data) of each value Notes ----- Arrays are manipulated via a pointer to there first element and their size as to ease the processing of dynamically allocated buffers. 
+ + For instance, in pseudo-code: + + heap_push( + values=[0.1, 0.4, 1.2], + indices=[42, 1, 5], + size=3, + val=0.2, + val_idx=4, + ) + + will modify values and indices inplace, giving at the end of the call: + + values == [0.1, 0.2, 0.4] + indices == [42, 4, 1] + """ cdef: ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx From 2a6c2d8c8ab8fbfb05b37aa6b93cbf2effcaa30f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Dec 2021 08:32:37 +0100 Subject: [PATCH 05/13] TST Correct wrong_distance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/neighbors/tests/test_ball_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index a823a03251a1b..52c44ab998086 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -88,7 +88,7 @@ def test_array_object_type(): def test_bad_pyfunc_metric(): - def wrong_distance(x, y): + def wrong_distance(x): return "1" X = np.ones((5, 2)) From 94dd141f89fb4b85cab020e95371f8dd44834a71 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Dec 2021 08:33:59 +0100 Subject: [PATCH 06/13] DOC Correct simple example for heap_push Co-authored-by: Thomas J. Fan --- sklearn/utils/_heap.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index d56740676966d..cbcff56457443 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -110,9 +110,11 @@ cdef inline int heap_push( For instance, in pseudo-code: + values = [1.2, 0.4, 0.1], + indices = [42, 1, 5], heap_push( - values=[0.1, 0.4, 1.2], - indices=[42, 1, 5], + values=values, + indices=indices, size=3, val=0.2, val_idx=4, @@ -120,8 +122,8 @@ cdef inline int heap_push( will modify values and indices inplace, giving at the end of the call: - values == [0.1, 0.2, 0.4] - indices == [42, 4, 1] + values == [0.4, 0.2, 0.1] + indices == [1, 4, 5] """ cdef: From 529ee275ef4705119f352251408a498cd2463799 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Dec 2021 08:36:29 +0100 Subject: [PATCH 07/13] MAINT Remove numpy include for heaps utilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/utils/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 04d446e9b55aa..e3ceab6c52bbf 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -98,7 +98,6 @@ def configuration(parent_package="", top_path=None): config.add_extension( "_heap", sources=["_heap.pyx"], - include_dirs=[numpy.get_include()], libraries=libraries, ) From cb32715702f1f0997084a1332d93de126470ad42 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Dec 2021 10:51:47 +0100 Subject: [PATCH 08/13] fixup! 
TST Correct wrong_distance --- sklearn/neighbors/tests/test_ball_tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 52c44ab998086..b051cb8b47de0 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -88,10 +88,10 @@ def test_array_object_type(): def test_bad_pyfunc_metric(): - def wrong_distance(x): + def wrong_distance(x, y): return "1" X = np.ones((5, 2)) - msg = "Custom distance function must accept two vectors" + msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): BallTree(X, metric=wrong_distance) From c312ead4012424a7f0666ac453e8bb5deab5597c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Dec 2021 10:51:47 +0100 Subject: [PATCH 09/13] fixup! TST Correct wrong_distance --- sklearn/neighbors/tests/test_ball_tree.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index b051cb8b47de0..414e780e4c601 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -88,10 +88,17 @@ def test_array_object_type(): def test_bad_pyfunc_metric(): - def wrong_distance(x, y): + def wrong_returned_value(x, y): return "1" + def one_arg_func(x): + return 1.0 + X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_distance) + BallTree(X, metric=wrong_returned_value) + + msg = "one_arg_func\(\) takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=one_arg_func) From da37426796bb17a0c1dbbaadc81a1c5c5df4ef50 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 17 Dec 2021 10:51:47 +0100 Subject: [PATCH 10/13] fixup! TST Correct wrong_distance --- sklearn/neighbors/tests/test_ball_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 414e780e4c601..cf00fabf3dc5d 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -99,6 +99,6 @@ def one_arg_func(x): with pytest.raises(TypeError, match=msg): BallTree(X, metric=wrong_returned_value) - msg = "one_arg_func\(\) takes 1 positional argument but 2 were given" + msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): BallTree(X, metric=one_arg_func) From bd6b568c1645b399aeaebf9061c2c30c936e9c67 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 08:47:58 +0100 Subject: [PATCH 11/13] Reword, ultimately Co-authored-by: Christian Lorentzen --- sklearn/utils/_heap.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx index cbcff56457443..03ab33d6e065b 100644 --- a/sklearn/utils/_heap.pyx +++ b/sklearn/utils/_heap.pyx @@ -20,7 +20,7 @@ cdef int simultaneous_sort( ) nogil: """ Perform a recursive quicksort on the values array as to sort them ascendingly. - This simultaneously perform the swaps on both the values and the indices arrays. + This simultaneously performs the swaps on both the values and the indices arrays. 
The numpy equivalent is: From 5889c6fa9b75a4dbb0ba6fb18e0c05f4e7b7c3b6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 09:19:38 +0100 Subject: [PATCH 12/13] Trigger CI As to see if there's a glitch on the CI codecov/patch. From fec284189cf3e8869eba3604bcdb1e7ad81c8def Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 21 Dec 2021 09:45:05 +0100 Subject: [PATCH 13/13] Do not cover unexecuted line of a wrongly specified function Co-authored-by: Christian Lorentzen --- sklearn/neighbors/tests/test_ball_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index cf00fabf3dc5d..d5046afd2da2a 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -92,7 +92,7 @@ def wrong_returned_value(x, y): return "1" def one_arg_func(x): - return 1.0 + return 1.0 # pragma: no cover X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float."
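
For reviewers following the heap refactor, here is a minimal pure-Python sketch of the fixed-size max-heap push that the new sklearn/utils/_heap.pyx implements in Cython (replace the root, then sift down). Plain lists stand in for the raw pointers, and the name heap_push_py is a hypothetical stand-in, not part of scikit-learn's API.

    # Illustrative pure-Python port of the Cython heap_push in sklearn/utils/_heap.pyx.
    # Lists stand in for the raw pointers; heap_push_py is a hypothetical name.
    def heap_push_py(values, indices, val, val_idx):
        """Push (val, val_idx) onto a fixed-size max-heap stored in two parallel lists."""
        size = len(values)

        # The root holds the current maximum; if val is not smaller, nothing changes.
        if val >= values[0]:
            return

        # Overwrite the root, then sift the new value down until the
        # max-heap property (parent >= children) holds again.
        values[0], indices[0] = val, val_idx
        current = 0
        while True:
            left = 2 * current + 1
            right = left + 1
            if left >= size:
                break
            elif right >= size:
                swap = left if values[left] > val else None
            elif values[left] >= values[right]:
                swap = left if val < values[left] else None
            else:
                swap = right if val < values[right] else None
            if swap is None:
                break
            # Pull the larger child up and keep descending.
            values[current], indices[current] = values[swap], indices[swap]
            current = swap
        values[current], indices[current] = val, val_idx

    # Reproduces the example from the heap_push docstring above:
    values, indices = [1.2, 0.4, 0.1], [42, 1, 5]
    heap_push_py(values, indices, 0.2, 4)
    assert values == [0.4, 0.2, 0.1] and indices == [1, 4, 5]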
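
Likewise, simultaneous_sort keeps a distances buffer and its companion indices buffer aligned while sorting by distance. A rough NumPy equivalent of its semantics, essentially the docstring's pseudo-code, is sketched below; as the TODO in the Cython source notes, neither version guarantees stable tie-breaking on indices when distances are equal.

    import numpy as np

    # Rough NumPy equivalent of the Cython simultaneous_sort: sort the distances
    # ascendingly and apply the same permutation to the indices. Illustrative only;
    # the Cython version sorts both buffers in place without allocating.
    def simultaneous_sort_py(dist, idx):
        order = np.argsort(dist)  # default quicksort: ties are not broken on idx
        return dist[order], idx[order]

    dist = np.array([0.7, 0.1, 0.4])
    idx = np.array([3, 9, 27])
    sorted_dist, sorted_idx = simultaneous_sort_py(dist, idx)
    # sorted_dist -> [0.1, 0.4, 0.7]; sorted_idx -> [9, 27, 3]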
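
Putting the two together shows how NeighborsHeap._push and _sort delegate to these helpers after the refactor: each row accumulates the k best candidates as a max-heap and is sorted once at the end. The walk-through below reuses the two Python stand-ins defined above; the all-inf initialization of an empty row is an assumption made for illustration, not something this diff shows.

    import numpy as np

    # Hypothetical walk-through of a single NeighborsHeap row holding the k=3
    # nearest neighbors, reusing heap_push_py and simultaneous_sort_py from above.
    k = 3
    row_dist = [np.inf] * k   # assumed empty-row state: no neighbor found yet
    row_idx = [-1] * k

    candidates = [(0.9, 10), (0.3, 11), (1.5, 12), (0.2, 13), (0.7, 14)]
    for d, i in candidates:
        heap_push_py(row_dist, row_idx, d, i)

    # The row now holds the 3 smallest distances in max-heap order; one final
    # simultaneous sort yields them ascending, with the matching indices.
    final_dist, final_idx = simultaneous_sort_py(np.array(row_dist), np.array(row_idx))
    # final_dist -> [0.2, 0.3, 0.7]; final_idx -> [13, 11, 14]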
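
On patches 09 and 10: pytest.raises(..., match=...) applies the given pattern as a regular expression via re.search, which is presumably why the escaped "one_arg_func\(\)" pattern was simplified to a fragment without parentheses. A small self-contained illustration, using a stand-in for the TypeError message Python raises in the test:

    import re

    # match= patterns in pytest.raises are applied with re.search, so regex
    # metacharacters such as "()" in an expected message must be escaped or dropped.
    # The message below is a stand-in for the TypeError raised in the test.
    error_message = "one_arg_func() takes 1 positional argument but 2 were given"

    assert re.search(r"one_arg_func\(\) takes 1 positional argument", error_message)
    assert re.search("takes 1 positional argument but 2 were given", error_message)
    # Unescaped parentheses form an empty regex group and no longer match literally:
    assert re.search("one_arg_func() takes", error_message) is None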