Thanks to visit codestin.com
Credit goes to github.com

Skip to content

FEA PairwiseDistancesReductions: support for Boolean DistanceMetrics via stable simultaneous sort #25097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from cython cimport final
from cython.parallel cimport parallel, prange

from ...utils._heap cimport heap_push
from ...utils._sorting cimport simultaneous_sort
from ...utils._sorting cimport sort
from ...utils._typedefs cimport ITYPE_t, DTYPE_t

import numpy as np
Expand Down Expand Up @@ -194,7 +194,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
# Sorting the portion of the main heaps associated with `X[X_start:X_end]`
# in ascending order w.r.t. the distances.
for idx in range(X_end - X_start):
simultaneous_sort(
sort(
self.heaps_r_distances_chunks[thread_num] + idx * self.k,
self.heaps_indices_chunks[thread_num] + idx * self.k,
self.k
Expand Down Expand Up @@ -278,7 +278,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
# Sorting the main results in ascending order w.r.t. the distances.
# This is done in parallel sample-wise (no need for locks).
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
sort(
&self.argkmin_distances[idx, 0],
&self.argkmin_indices[idx, 0],
self.k,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from cython cimport final
from cython.operator cimport dereference as deref
from cython.parallel cimport parallel, prange

from ...utils._sorting cimport simultaneous_sort
from ...utils._sorting cimport sort
from ...utils._typedefs cimport ITYPE_t, DTYPE_t
from ...utils._vector_sentinel cimport vector_to_nd_array

Expand Down Expand Up @@ -221,7 +221,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
# Sorting neighbors for each query vector of X
if self.sort_results:
for idx in range(X_start, X_end):
simultaneous_sort(
sort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
Expand Down Expand Up @@ -292,7 +292,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
# Sort in parallel in ascending order w.r.t the distances if requested.
if self.sort_results:
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
sort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
Expand Down
116 changes: 2 additions & 114 deletions sklearn/tree/_splitter.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import numpy as np

from scipy.sparse import csc_matrix

from ..utils._sorting cimport sort

from ._utils cimport log
from ._utils cimport rand_int
from ._utils cimport rand_uniform
Expand Down Expand Up @@ -437,120 +439,6 @@ cdef class BestSplitter(BaseDenseSplitter):
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # Guard: empty range; also avoids evaluating log(0) below.
    if n == 0:
        return
    # Introsort depth budget: 2 * floor(log2(n)) quicksort levels before
    # falling back to heapsort, which bounds the worst case at O(n log n).
    cdef int maxd = 2 * <int>log(n)
    introsort(Xf, samples, n, maxd)


cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples,
                      SIZE_t i, SIZE_t j) nogil:
    # Helper for sort: exchange entries i and j in both arrays so that
    # feature values stay aligned with their sample indices.
    Xf[i], Xf[j] = Xf[j], Xf[i]
    samples[i], samples[j] = samples[j], samples[i]


cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    # Samples the first, middle and last elements of Xf[0:n].
    cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        if b < c:
            return b  # a < b < c
        elif a < c:
            return c  # a < c <= b
        else:
            return a  # c <= a < b
    elif b < c:
        if a < c:
            return a  # b <= a < c
        else:
            return c  # b < c <= a
    else:
        return b  # c <= b <= a


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* Xf, SIZE_t *samples,
                    SIZE_t n, int maxd) nogil:
    # Sorts Xf[0:n] in ascending order, applying the same permutation to
    # samples. `maxd` is the remaining recursion-depth budget.
    cdef DTYPE_t pivot
    cdef SIZE_t i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition: on exit, Xf[0:l] < pivot,
        # Xf[l:r] == pivot, and Xf[r:n] > pivot.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        # Recurse on the "< pivot" prefix; continue the while-loop on the
        # "> pivot" suffix (tail-call elimination). The middle run, equal
        # to the pivot, is already in its final position.
        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r


cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples,
                           SIZE_t start, SIZE_t end) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    # Children of node i live at 2*i + 1 and 2*i + 2 (implicit binary heap).
    cdef SIZE_t child, maxind, root

    root = start
    while True:
        child = root * 2 + 1

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            # Heap property restored; stop sifting.
            break
        else:
            swap(Xf, samples, root, maxind)
            root = maxind


cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # In-place max-heap sort of Xf[0:n], permuting samples identically.
    # Used as introsort's O(n log n) fallback once the depth budget runs out.
    cdef SIZE_t start, end

    # heapify: sift down every internal node, from the last one up to the root
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1


cdef class RandomSplitter(BaseDenseSplitter):
"""Splitter for finding the best random split."""
def __reduce__(self):
Expand Down
5 changes: 4 additions & 1 deletion sklearn/utils/_sorting.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ._typedefs cimport DTYPE_t, ITYPE_t
cimport numpy as cnp
from ._typedefs cimport ITYPE_t

from cython cimport floating

Expand All @@ -7,3 +8,5 @@ cdef int simultaneous_sort(
ITYPE_t *idx,
ITYPE_t size,
) nogil

cdef void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil
129 changes: 129 additions & 0 deletions sklearn/utils/_sorting.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
cimport numpy as cnp

from cython cimport floating
from libc.math cimport log as ln

# TODO: Factor code also present in `tree._utils` or use `libc.math.log2` directly
cdef inline double log(double x) nogil:
    # Base-2 logarithm computed via the natural log: log2(x) = ln(x) / ln(2).
    return ln(x) / ln(2.0)


cdef inline void dual_swap(
floating* darr,
Expand Down Expand Up @@ -91,3 +99,124 @@ cdef int simultaneous_sort(
indices + pivot_idx + 1,
size - pivot_idx - 1)
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil:
    # Guard: empty range; also avoids evaluating log(0) below.
    if n == 0:
        return
    # Introsort depth budget: 2 * floor(log2(n)) quicksort levels before
    # falling back to heapsort, which bounds the worst case at O(n log n).
    cdef int maxd = 2 * <int>log(n)
    introsort(Xf, samples, n, maxd)


cdef inline void swap(
    floating* Xf,
    cnp.npy_intp* samples,
    cnp.npy_intp i,
    cnp.npy_intp j,
) nogil:
    # Helper for sort: exchange entries i and j in both arrays, keeping
    # feature values aligned with their sample indices.
    cdef floating value_tmp = Xf[i]
    cdef cnp.npy_intp index_tmp = samples[i]
    Xf[i] = Xf[j]
    samples[i] = samples[j]
    Xf[j] = value_tmp
    samples[j] = index_tmp


cdef inline floating median3(floating* Xf, cnp.npy_intp n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    # Samples the first, middle and last elements of Xf[0:n].
    cdef floating a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        if b < c:
            return b  # a < b < c
        elif a < c:
            return c  # a < c <= b
        else:
            return a  # c <= a < b
    elif b < c:
        if a < c:
            return a  # b <= a < c
        else:
            return c  # b < c <= a
    else:
        return b  # c <= b <= a


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil:
    # Sorts Xf[0:n] in ascending order, applying the same permutation to
    # samples. `maxd` is the remaining recursion-depth budget.
    cdef floating pivot
    cdef cnp.npy_intp i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition: on exit, Xf[0:l] < pivot,
        # Xf[l:r] == pivot, and Xf[r:n] > pivot.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        # Recurse on the "< pivot" prefix; continue the while-loop on the
        # "> pivot" suffix (tail-call elimination). The middle run, equal
        # to the pivot, is already in its final position.
        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r


cdef inline void sift_down(
    floating* Xf,
    cnp.npy_intp* samples,
    cnp.npy_intp start,
    cnp.npy_intp end,
) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    # Children of node i live at 2*i + 1 and 2*i + 2 (implicit binary heap).
    cdef cnp.npy_intp child, maxind, root

    root = start
    while True:
        child = root * 2 + 1

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            # Heap property restored; stop sifting.
            break
        else:
            swap(Xf, samples, root, maxind)
            root = maxind


cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil:
    # In-place max-heap sort of Xf[0:n], permuting samples identically.
    # Used as introsort's O(n log n) fallback once the depth budget runs out.
    cdef cnp.npy_intp start, end

    # heapify: sift down every internal node, from the last one up to the root
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1