diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 7edc64c59a050..f2d122c6f1826 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -4,7 +4,7 @@ from cython cimport final
 from cython.parallel cimport parallel, prange

 from ...utils._heap cimport heap_push
-from ...utils._sorting cimport simultaneous_sort
+from ...utils._sorting cimport simultaneous_quicksort
 from ...utils._typedefs cimport intp_t, float64_t

 import numpy as np
@@ -184,7 +184,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         # Sorting the main heaps portion associated to `X[X_start:X_end]`
         # in ascending order w.r.t the distances.
         for idx in range(X_end - X_start):
-            simultaneous_sort(
+            simultaneous_quicksort(
                 self.heaps_r_distances_chunks[thread_num] + idx * self.k,
                 self.heaps_indices_chunks[thread_num] + idx * self.k,
                 self.k
@@ -268,7 +268,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
             # Sorting the main in ascending order w.r.t the distances.
             # This is done in parallel sample-wise (no need for locks).
             for idx in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
+                simultaneous_quicksort(
                     &self.argkmin_distances[idx, 0],
                     &self.argkmin_indices[idx, 0],
                     self.k,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 1defa30b6325e..2d0f8819c4998 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -8,7 +8,7 @@ from cython cimport final
 from cython.operator cimport dereference as deref
 from cython.parallel cimport parallel, prange

-from ...utils._sorting cimport simultaneous_sort
+from ...utils._sorting cimport simultaneous_quicksort
 from ...utils._typedefs cimport intp_t, float64_t
 from ...utils._vector_sentinel cimport vector_to_nd_array

@@ -218,7 +218,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         # Sorting neighbors for each query vector of X
         if self.sort_results:
             for idx in range(X_start, X_end):
-                simultaneous_sort(
+                simultaneous_quicksort(
                     deref(self.neigh_distances)[idx].data(),
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
@@ -289,7 +289,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         # Sort in parallel in ascending order w.r.t the distances if requested.
         if self.sort_results:
             for idx in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
+                simultaneous_quicksort(
                     deref(self.neigh_distances)[idx].data(),
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index a03b7cdec2294..7078ea629a4d1 100644
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -163,7 +163,7 @@ from ._partition_nodes cimport partition_node_indices
 from ..utils import check_array
 from ..utils._typedefs cimport float64_t, intp_t
 from ..utils._heap cimport heap_push
-from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort
+from ..utils._sorting cimport simultaneous_quicksort

 cnp.import_array()

@@ -561,9 +561,9 @@ cdef class NeighborsHeap:
         """simultaneously sort the distances and indices"""
         cdef intp_t row
         for row in range(self.distances.shape[0]):
-            _simultaneous_sort(
-                dist=&self.distances[row, 0],
-                idx=&self.indices[row, 0],
+            simultaneous_quicksort(
+                values=&self.distances[row, 0],
+                indices=&self.indices[row, 0],
                 size=self.distances.shape[1],
             )
         return 0
@@ -1305,8 +1305,11 @@ cdef class BinaryTree:
                 continue

             if sort_results:
-                _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0],
-                                   counts[i])
+                simultaneous_quicksort(
+                    &dist_arr_i[0],
+                    &idx_arr_i[0],
+                    counts[i],
+                )

             # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy()
             indices[i] = malloc(counts[i] * sizeof(intp_t))
@@ -2388,15 +2391,17 @@ def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices):
     """In-place simultaneous sort the given row of the arrays

     This python wrapper exists primarily to enable unit testing
-    of the _simultaneous_sort C routine.
+    of the simultaneous_quicksort C routine.
""" assert distances.shape[0] == indices.shape[0] assert distances.shape[1] == indices.shape[1] cdef intp_t row for row in range(distances.shape[0]): - _simultaneous_sort(&distances[row, 0], - &indices[row, 0], - distances.shape[1]) + simultaneous_quicksort( + &distances[row, 0], + &indices[row, 0], + distances.shape[1], + ) def nodeheap_sort(float64_t[::1] vals): diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index d485e799eb5b0..72648ce937b97 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -11,13 +11,11 @@ BallTree, kernel_norm, NeighborsHeap as NeighborsHeapBT, - simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( KDTree, NeighborsHeap as NeighborsHeapKDT, - simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, ) @@ -188,30 +186,6 @@ def test_node_heap(nodeheap_sort, n_nodes=50): assert_array_almost_equal(vals[i1], vals2) -@pytest.mark.parametrize( - "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt] -) -def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201): - rng = check_random_state(0) - dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False) - ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False) - - dist2 = dist.copy() - ind2 = ind.copy() - - # simultaneous sort rows using function - simultaneous_sort(dist, ind) - - # simultaneous sort rows using numpy - i = np.argsort(dist2, axis=1) - row_ind = np.arange(n_rows)[:, None] - dist2 = dist2[row_ind, i] - ind2 = ind2[row_ind, i] - - assert_array_almost_equal(dist, dist2) - assert_array_almost_equal(ind, ind2) - - @pytest.mark.parametrize("Cls", [KDTree, BallTree]) def test_gaussian_kde(Cls, n_samples=1000): # Compare gaussian KDE results to scipy.stats.gaussian_kde diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..9c1d8d369e5ce 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,6 +21,10 @@ import numpy as np from scipy.sparse import csc_matrix +from ..utils._sorting cimport simultaneous_introsort as sort +# TODO: when Cython>=3.0 is used, remove the casts in call to sort. +from ..utils._typedefs cimport intp_t + from ._utils cimport log from ._utils cimport rand_int from ._utils cimport rand_uniform @@ -413,119 +417,6 @@ cdef inline int node_split_best( return 0 -# Sort n-element arrays pointed to by feature_values and samples, simultaneously, -# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil: - if n == 0: - return - cdef int maxd = 2 * log(n) - introsort(feature_values, samples, n, maxd) - - -cdef inline void swap(DTYPE_t* feature_values, SIZE_t* samples, - SIZE_t i, SIZE_t j) noexcept nogil: - # Helper for sort - feature_values[i], feature_values[j] = feature_values[j], feature_values[i] - samples[i], samples[j] = samples[j], samples[i] - - -cdef inline DTYPE_t median3(DTYPE_t* feature_values, SIZE_t n) noexcept nogil: - # Median of three pivot selection, after Bentley and McIlroy (1993). - # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. 
-    cdef DTYPE_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1]
-    if a < b:
-        if b < c:
-            return b
-        elif a < c:
-            return c
-        else:
-            return a
-    elif b < c:
-        if a < c:
-            return a
-        else:
-            return c
-    else:
-        return b
-
-
-# Introsort with median of 3 pivot selection and 3-way partition function
-# (robust to repeated elements, e.g. lots of zero features).
-cdef void introsort(DTYPE_t* feature_values, SIZE_t *samples,
-                    SIZE_t n, int maxd) noexcept nogil:
-    cdef DTYPE_t pivot
-    cdef SIZE_t i, l, r
-
-    while n > 1:
-        if maxd <= 0:  # max depth limit exceeded ("gone quadratic")
-            heapsort(feature_values, samples, n)
-            return
-        maxd -= 1
-
-        pivot = median3(feature_values, n)
-
-        # Three-way partition.
-        i = l = 0
-        r = n
-        while i < r:
-            if feature_values[i] < pivot:
-                swap(feature_values, samples, i, l)
-                i += 1
-                l += 1
-            elif feature_values[i] > pivot:
-                r -= 1
-                swap(feature_values, samples, i, r)
-            else:
-                i += 1
-
-        introsort(feature_values, samples, l, maxd)
-        feature_values += r
-        samples += r
-        n -= r
-
-
-cdef inline void sift_down(DTYPE_t* feature_values, SIZE_t* samples,
-                           SIZE_t start, SIZE_t end) noexcept nogil:
-    # Restore heap order in feature_values[start:end] by moving the max element to start.
-    cdef SIZE_t child, maxind, root
-
-    root = start
-    while True:
-        child = root * 2 + 1
-
-        # find max of root, left child, right child
-        maxind = root
-        if child < end and feature_values[maxind] < feature_values[child]:
-            maxind = child
-        if child + 1 < end and feature_values[maxind] < feature_values[child + 1]:
-            maxind = child + 1
-
-        if maxind == root:
-            break
-        else:
-            swap(feature_values, samples, root, maxind)
-            root = maxind
-
-
-cdef void heapsort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil:
-    cdef SIZE_t start, end
-
-    # heapify
-    start = (n - 2) / 2
-    end = n
-    while True:
-        sift_down(feature_values, samples, start, end)
-        if start == 0:
-            break
-        start -= 1
-
-    # sort by shrinking the heap, putting the max element immediately after it
-    end = n - 1
-    while end > 0:
-        swap(feature_values, samples, 0, end)
-        sift_down(feature_values, samples, 0, end)
-        end = end - 1
-
-
 cdef inline int node_split_random(
     Splitter splitter,
     Partitioner partitioner,
@@ -742,7 +633,7 @@ cdef class DensePartitioner:
         # effectively.
         for i in range(self.start, self.end):
             feature_values[i] = X[samples[i], current_feature]
-        sort(&feature_values[self.start], &samples[self.start], self.end - self.start)
+        sort(&feature_values[self.start], &samples[self.start], self.end - self.start)

     cdef inline void find_min_max(
         self,
@@ -901,9 +792,9 @@ cdef class SparsePartitioner:
         self.extract_nnz(current_feature)
         # Sort the positive and negative parts of `feature_values`
-        sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
+        sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
         if self.start_positive < self.end:
-            sort(&feature_values[self.start_positive], &samples[self.start_positive],
+            sort(&feature_values[self.start_positive], &samples[self.start_positive],
                  self.end - self.start_positive)

         # Update index_to_samples to take into account the sort
diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd
index 51f21afd4d3e4..92dacd5e0d82c 100644
--- a/sklearn/utils/_sorting.pxd
+++ b/sklearn/utils/_sorting.pxd
@@ -1,9 +1,21 @@
+from cython cimport floating
+
 from ._typedefs cimport intp_t

-from cython cimport floating
+cdef void simultaneous_quicksort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+) noexcept nogil
+
+cdef void simultaneous_introsort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+) noexcept nogil

-cdef int simultaneous_sort(
-    floating *dist,
-    intp_t *idx,
+cdef void simultaneous_heapsort(
+    floating* values,
+    intp_t* indices,
     intp_t size,
 ) noexcept nogil
diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx
index 13b2d872392b9..0e04b2efa9ea5 100644
--- a/sklearn/utils/_sorting.pyx
+++ b/sklearn/utils/_sorting.pyx
@@ -1,22 +1,135 @@
 from cython cimport floating
+from libc.math cimport log2

-cdef inline void dual_swap(
-    floating* darr,
-    intp_t *iarr,
-    intp_t a,
-    intp_t b,
+from ._typedefs cimport intp_t
+
+# TODO: In order to support discrete distance metrics, we need to have a
+# simultaneous sort which breaks ties on indices when distances are identical.
+# The best might be using a std::stable_sort and a Comparator which might need
+# an Array of Structures (AoS) instead of the Structure of Arrays (SoA)
+# currently used. Alternatively, we can find a stable algorithm for SoA and
+# adapt it so that it is simultaneous.
+
+# Utility functions
+
+
+def _simultaneous_sort(
+    floating[::1] values,
+    intp_t[::1] indices,
+    kind=None,
+):
+    """Interface to simultaneous sorting algorithms.
+
+    `values` and `indices` are sorted simultaneously based on increasing
+    order of elements in `values`.
+
+    This interface exposes Cython implementations but is only meant to be
+    used for testing purposes.
+
+    Parameters
+    ----------
+    values : ndarray
+        1-D array of floating values to sort.
+
+    indices : ndarray
+        Associated 1-D array of values' indices to sort.
+
+    kind : str, default=None
+        Kind of the sorting algorithm to use.
+        Valid values for `kind` are in {'introsort', 'quicksort', 'heapsort'}.
+        If None, 'introsort' is used.
+ """ + cdef intp_t size = indices.shape[0] + + if kind is None: + kind = "introsort" + + if kind == "introsort": + return simultaneous_introsort(&values[0], &indices[0], size) + + if kind == "quicksort": + return simultaneous_quicksort(&values[0], &indices[0], size) + + if kind == "heapsort": + return simultaneous_heapsort(&values[0], &indices[0], size) + + raise ValueError(f"Currently kind='{kind}', but kind must be in ('introsort', 'quicksort', 'heapsort').") + +cdef inline void _simultaneous_swap( + floating* values, + intp_t* indices, + intp_t i, + intp_t j, +) noexcept nogil: + # Helper for sort + values[i], values[j] = values[j], values[i] + indices[i], indices[j] = indices[j], indices[i] + +cdef inline floating _median3( + floating* values, + intp_t size, +) noexcept nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef floating a = values[0], b = values[size / 2], c = values[size - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + +cdef inline void _sift_down( + floating* values, + intp_t* indices, + intp_t start, + intp_t end, ) noexcept nogil: - """Swap the values at index a and b of both darr and iarr""" - cdef floating dtmp = darr[a] - darr[a] = darr[b] - darr[b] = dtmp + # Restore heap order in values[start:end] by moving the max element to start. + cdef intp_t child, maxind, root - cdef intp_t itmp = iarr[a] - iarr[a] = iarr[b] - iarr[b] = itmp + root = start + while True: + child = root * 2 + 1 + # find max of root, left child, right child + maxind = root + if child < end and values[maxind] < values[child]: + maxind = child + if child + 1 < end and values[maxind] < values[child + 1]: + maxind = child + 1 -cdef int simultaneous_sort( + if maxind == root: + break + else: + _simultaneous_swap(values, indices, root, maxind) + root = maxind + + +# Sorting functions + +cdef inline void simultaneous_introsort( + floating* values, + intp_t* indices, + intp_t size, +) noexcept nogil: + # Sort a Structure of Arrays pointed consisting of arrays of values and indices, + # simultaneously, based on the values. Algorithm: Introsort (Musser, SP&E, 1997). + if size == 0: + return + cdef int maxd = 2 * log2(size) + _simultaneous_introsort(values, indices, size, maxd) + + +cdef void simultaneous_quicksort( floating* values, intp_t* indices, intp_t size, @@ -36,11 +149,6 @@ cdef int simultaneous_sort( Arrays are manipulated via a pointer to there first element and their size as to ease the processing of dynamically allocated buffers. """ - # TODO: In order to support discrete distance metrics, we need to have a - # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::stable_sort and a Comparator which might need - # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) - # currently used. 
     cdef:
         intp_t pivot_idx, i, store_idx
         floating pivot_val
@@ -50,14 +158,14 @@ cdef int simultaneous_sort(
         pass
     elif size == 2:
         if values[0] > values[1]:
-            dual_swap(values, indices, 0, 1)
+            _simultaneous_swap(values, indices, 0, 1)
     elif size == 3:
         if values[0] > values[1]:
-            dual_swap(values, indices, 0, 1)
+            _simultaneous_swap(values, indices, 0, 1)
         if values[1] > values[2]:
-            dual_swap(values, indices, 1, 2)
+            _simultaneous_swap(values, indices, 1, 2)
         if values[0] > values[1]:
-            dual_swap(values, indices, 0, 1)
+            _simultaneous_swap(values, indices, 0, 1)
     else:
         # Determine the pivot using the median-of-three rule.
         # The smallest of the three is moved to the beginning of the array,
@@ -65,11 +173,11 @@ cdef int simultaneous_sort(
         # is moved to the pivot index.
         pivot_idx = size // 2
         if values[0] > values[size - 1]:
-            dual_swap(values, indices, 0, size - 1)
+            _simultaneous_swap(values, indices, 0, size - 1)
         if values[size - 1] > values[pivot_idx]:
-            dual_swap(values, indices, size - 1, pivot_idx)
+            _simultaneous_swap(values, indices, size - 1, pivot_idx)
         if values[0] > values[size - 1]:
-            dual_swap(values, indices, 0, size - 1)
+            _simultaneous_swap(values, indices, 0, size - 1)
         pivot_val = values[size - 1]

         # Partition indices about pivot. At the end of this operation,
@@ -78,16 +186,77 @@ cdef int simultaneous_sort(
         store_idx = 0
         for i in range(size - 1):
             if values[i] < pivot_val:
-                dual_swap(values, indices, i, store_idx)
+                _simultaneous_swap(values, indices, i, store_idx)
                 store_idx += 1
-        dual_swap(values, indices, store_idx, size - 1)
+        _simultaneous_swap(values, indices, store_idx, size - 1)
         pivot_idx = store_idx

         # Recursively sort each side of the pivot
        if pivot_idx > 1:
-            simultaneous_sort(values, indices, pivot_idx)
+            simultaneous_quicksort(values, indices, pivot_idx)
         if pivot_idx + 2 < size:
-            simultaneous_sort(values + pivot_idx + 1,
-                              indices + pivot_idx + 1,
-                              size - pivot_idx - 1)
-    return 0
+            simultaneous_quicksort(values + pivot_idx + 1,
+                                   indices + pivot_idx + 1,
+                                   size - pivot_idx - 1)
+
+
+# Introsort with median of 3 pivot selection and 3-way partition function
+# (robust to repeated elements, e.g. lots of zero features).
+cdef void _simultaneous_introsort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+    int maxd,
+) noexcept nogil:
+    cdef floating pivot
+    cdef intp_t i, l, r
+
+    while size > 1:
+        if maxd <= 0:  # max depth limit exceeded ("gone quadratic")
+            simultaneous_heapsort(values, indices, size)
+            return
+        maxd -= 1
+
+        pivot = _median3(values, size)
+
+        # Three-way partition.
+        i = l = 0
+        r = size
+        while i < r:
+            if values[i] < pivot:
+                _simultaneous_swap(values, indices, i, l)
+                i += 1
+                l += 1
+            elif values[i] > pivot:
+                r -= 1
+                _simultaneous_swap(values, indices, i, r)
+            else:
+                i += 1
+
+        _simultaneous_introsort(values, indices, l, maxd)
+        values += r
+        indices += r
+        size -= r
+
+cdef void simultaneous_heapsort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+) noexcept nogil:
+    cdef intp_t start, end
+
+    # heapify
+    start = (size - 2) / 2
+    end = size
+    while True:
+        _sift_down(values, indices, start, end)
+        if start == 0:
+            break
+        start -= 1
+
+    # sort by shrinking the heap, putting the max element immediately after it
+    end = size - 1
+    while end > 0:
+        _simultaneous_swap(values, indices, 0, end)
+        _sift_down(values, indices, 0, end)
+        end = end - 1
diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py
new file mode 100644
index 0000000000000..e5488080bed9c
--- /dev/null
+++ b/sklearn/utils/tests/test_sorting.py
@@ -0,0 +1,37 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_array_almost_equal
+
+from sklearn.utils._sorting import _simultaneous_sort
+
+from sklearn.utils import check_random_state
+
+
+def test_simultaneous_sort_wrong_usage():
+    rng = check_random_state(0)
+    values = rng.random_sample(10).astype(np.float64, copy=False)
+    indices = np.arange(10).astype(np.intp, copy=False)
+
+    with pytest.raises(ValueError, match="Currently kind='nonexistent'"):
+        _simultaneous_sort(values, indices, kind="nonexistent")
+
+
+@pytest.mark.parametrize("kind", ["introsort", "heapsort", "quicksort"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_simultaneous_sort(kind, dtype, global_random_seed, n_pts=201):
+    # Sort sanity check
+    rng = check_random_state(global_random_seed)
+    values = rng.random_sample(n_pts).astype(dtype, copy=False)
+    indices = np.arange(n_pts).astype(np.intp, copy=False)
+
+    values_2 = values.copy()
+    indices_2 = indices.copy()
+
+    _simultaneous_sort(values, indices, kind=kind)
+
+    sorted_indices = np.argsort(values_2)
+    values_2 = values_2[sorted_indices]
+    indices_2 = indices_2[sorted_indices]
+
+    assert_array_almost_equal(values, values_2)
+    assert_array_almost_equal(indices, indices_2)
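Not part of the patch above: for readers who want to follow the dual-array control flow without the Cython pointer syntax, here is a minimal pure-Python sketch of the median-of-three quicksort that `simultaneous_quicksort` implements, sorting a values array while permuting an indices array in lockstep. The function name `simultaneous_quicksort_py` and the example data are made up for illustration only; the shipped implementation is the `nogil` Cython routine in `sklearn/utils/_sorting.pyx`, which also short-circuits sizes 2 and 3 and guards its recursion differently.

```python
def simultaneous_quicksort_py(values, indices, lo=0, hi=None):
    """Sort values[lo:hi] in ascending order, permuting indices in lockstep."""
    if hi is None:
        hi = len(values)

    def swap(a, b):
        # Swap both arrays at the same positions so they stay aligned.
        values[a], values[b] = values[b], values[a]
        indices[a], indices[b] = indices[b], indices[a]

    size = hi - lo
    if size <= 1:
        return

    # Median-of-three pivot selection: after these comparisons the smallest of
    # the three probed elements sits at lo, the largest at the midpoint, and
    # the median (used as the pivot) at hi - 1, mirroring the Cython routine.
    mid = lo + size // 2
    if values[lo] > values[hi - 1]:
        swap(lo, hi - 1)
    if values[hi - 1] > values[mid]:
        swap(hi - 1, mid)
    if values[lo] > values[hi - 1]:
        swap(lo, hi - 1)
    pivot = values[hi - 1]

    # Partition: move everything strictly smaller than the pivot to the front,
    # then place the pivot right after that block.
    store = lo
    for i in range(lo, hi - 1):
        if values[i] < pivot:
            swap(i, store)
            store += 1
    swap(store, hi - 1)

    # Recurse on both sides of the pivot.
    simultaneous_quicksort_py(values, indices, lo, store)
    simultaneous_quicksort_py(values, indices, store + 1, hi)


values = [0.3, 0.1, 0.2]
indices = [0, 1, 2]
simultaneous_quicksort_py(values, indices)
assert values == [0.1, 0.2, 0.3]
assert indices == [1, 2, 0]
```

The same lockstep-swap idea underlies `simultaneous_introsort` in the patch: it runs this kind of partitioning until a depth budget of `2 * log2(size)` is exhausted and then falls back to `simultaneous_heapsort`, which keeps the worst case at O(n log n) even on adversarial or highly repetitive inputs.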