diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 7edc64c59a050..f2d122c6f1826 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -4,7 +4,7 @@ from cython cimport final
 from cython.parallel cimport parallel, prange

 from ...utils._heap cimport heap_push
-from ...utils._sorting cimport simultaneous_sort
+from ...utils._sorting cimport simultaneous_quicksort
 from ...utils._typedefs cimport intp_t, float64_t

 import numpy as np
@@ -184,7 +184,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         # Sorting the main heaps portion associated to `X[X_start:X_end]`
         # in ascending order w.r.t the distances.
         for idx in range(X_end - X_start):
-            simultaneous_sort(
+            simultaneous_quicksort(
                 self.heaps_r_distances_chunks[thread_num] + idx * self.k,
                 self.heaps_indices_chunks[thread_num] + idx * self.k,
                 self.k
@@ -268,7 +268,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
             # Sorting the main in ascending order w.r.t the distances.
             # This is done in parallel sample-wise (no need for locks).
             for idx in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
+                simultaneous_quicksort(
                     &self.argkmin_distances[idx, 0],
                     &self.argkmin_indices[idx, 0],
                     self.k,
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index 1defa30b6325e..2d0f8819c4998 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -8,7 +8,7 @@ from cython cimport final
 from cython.operator cimport dereference as deref
 from cython.parallel cimport parallel, prange

-from ...utils._sorting cimport simultaneous_sort
+from ...utils._sorting cimport simultaneous_quicksort
 from ...utils._typedefs cimport intp_t, float64_t
 from ...utils._vector_sentinel cimport vector_to_nd_array

@@ -218,7 +218,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         # Sorting neighbors for each query vector of X
         if self.sort_results:
             for idx in range(X_start, X_end):
-                simultaneous_sort(
+                simultaneous_quicksort(
                     deref(self.neigh_distances)[idx].data(),
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
@@ -289,7 +289,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         # Sort in parallel in ascending order w.r.t the distances if requested.
         if self.sort_results:
             for idx in prange(self.n_samples_X, schedule='static'):
-                simultaneous_sort(
+                simultaneous_quicksort(
                     deref(self.neigh_distances)[idx].data(),
                     deref(self.neigh_indices)[idx].data(),
                     deref(self.neigh_indices)[idx].size()
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index a03b7cdec2294..7078ea629a4d1 100644
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -163,7 +163,7 @@ from ._partition_nodes cimport partition_node_indices
 from ..utils import check_array
 from ..utils._typedefs cimport float64_t, intp_t
 from ..utils._heap cimport heap_push
-from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort
+from ..utils._sorting cimport simultaneous_quicksort

 cnp.import_array()

@@ -561,9 +561,9 @@ cdef class NeighborsHeap:
         """simultaneously sort the distances and indices"""
         cdef intp_t row
         for row in range(self.distances.shape[0]):
-            _simultaneous_sort(
-                dist=&self.distances[row, 0],
-                idx=&self.indices[row, 0],
+            simultaneous_quicksort(
+                values=&self.distances[row, 0],
+                indices=&self.indices[row, 0],
                 size=self.distances.shape[1],
             )
         return 0
@@ -1305,8 +1305,11 @@ cdef class BinaryTree:
                 continue

             if sort_results:
-                _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0],
-                                   counts[i])
+                simultaneous_quicksort(
+                    &dist_arr_i[0],
+                    &idx_arr_i[0],
+                    counts[i],
+                )

             # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy()
             indices[i] = malloc(counts[i] * sizeof(intp_t))
@@ -2388,15 +2391,17 @@ def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices):
     """In-place simultaneous sort the given row of the arrays

     This python wrapper exists primarily to enable unit testing
-    of the _simultaneous_sort C routine.
+    of the simultaneous_quicksort C routine.
""" assert distances.shape[0] == indices.shape[0] assert distances.shape[1] == indices.shape[1] cdef intp_t row for row in range(distances.shape[0]): - _simultaneous_sort(&distances[row, 0], - &indices[row, 0], - distances.shape[1]) + simultaneous_quicksort( + &distances[row, 0], + &indices[row, 0], + distances.shape[1], + ) def nodeheap_sort(float64_t[::1] vals): diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index d485e799eb5b0..72648ce937b97 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -11,13 +11,11 @@ BallTree, kernel_norm, NeighborsHeap as NeighborsHeapBT, - simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( KDTree, NeighborsHeap as NeighborsHeapKDT, - simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, ) @@ -188,30 +186,6 @@ def test_node_heap(nodeheap_sort, n_nodes=50): assert_array_almost_equal(vals[i1], vals2) -@pytest.mark.parametrize( - "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt] -) -def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201): - rng = check_random_state(0) - dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False) - ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False) - - dist2 = dist.copy() - ind2 = ind.copy() - - # simultaneous sort rows using function - simultaneous_sort(dist, ind) - - # simultaneous sort rows using numpy - i = np.argsort(dist2, axis=1) - row_ind = np.arange(n_rows)[:, None] - dist2 = dist2[row_ind, i] - ind2 = ind2[row_ind, i] - - assert_array_almost_equal(dist, dist2) - assert_array_almost_equal(ind, ind2) - - @pytest.mark.parametrize("Cls", [KDTree, BallTree]) def test_gaussian_kde(Cls, n_samples=1000): # Compare gaussian KDE results to scipy.stats.gaussian_kde diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..9c1d8d369e5ce 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -21,6 +21,10 @@ import numpy as np from scipy.sparse import csc_matrix +from ..utils._sorting cimport simultaneous_introsort as sort +# TODO: when Cython>=3.0 is used, remove the casts in call to sort. +from ..utils._typedefs cimport intp_t + from ._utils cimport log from ._utils cimport rand_int from ._utils cimport rand_uniform @@ -413,119 +417,6 @@ cdef inline int node_split_best( return 0 -# Sort n-element arrays pointed to by feature_values and samples, simultaneously, -# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil: - if n == 0: - return - cdef int maxd = 2 * log(n) - introsort(feature_values, samples, n, maxd) - - -cdef inline void swap(DTYPE_t* feature_values, SIZE_t* samples, - SIZE_t i, SIZE_t j) noexcept nogil: - # Helper for sort - feature_values[i], feature_values[j] = feature_values[j], feature_values[i] - samples[i], samples[j] = samples[j], samples[i] - - -cdef inline DTYPE_t median3(DTYPE_t* feature_values, SIZE_t n) noexcept nogil: - # Median of three pivot selection, after Bentley and McIlroy (1993). - # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. 
-    cdef DTYPE_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1]
-    if a < b:
-        if b < c:
-            return b
-        elif a < c:
-            return c
-        else:
-            return a
-    elif b < c:
-        if a < c:
-            return a
-        else:
-            return c
-    else:
-        return b
-
-
-# Introsort with median of 3 pivot selection and 3-way partition function
-# (robust to repeated elements, e.g. lots of zero features).
-cdef void introsort(DTYPE_t* feature_values, SIZE_t *samples,
-                    SIZE_t n, int maxd) noexcept nogil:
-    cdef DTYPE_t pivot
-    cdef SIZE_t i, l, r
-
-    while n > 1:
-        if maxd <= 0:  # max depth limit exceeded ("gone quadratic")
-            heapsort(feature_values, samples, n)
-            return
-        maxd -= 1
-
-        pivot = median3(feature_values, n)
-
-        # Three-way partition.
-        i = l = 0
-        r = n
-        while i < r:
-            if feature_values[i] < pivot:
-                swap(feature_values, samples, i, l)
-                i += 1
-                l += 1
-            elif feature_values[i] > pivot:
-                r -= 1
-                swap(feature_values, samples, i, r)
-            else:
-                i += 1
-
-        introsort(feature_values, samples, l, maxd)
-        feature_values += r
-        samples += r
-        n -= r
-
-
-cdef inline void sift_down(DTYPE_t* feature_values, SIZE_t* samples,
-                           SIZE_t start, SIZE_t end) noexcept nogil:
-    # Restore heap order in feature_values[start:end] by moving the max element to start.
-    cdef SIZE_t child, maxind, root
-
-    root = start
-    while True:
-        child = root * 2 + 1
-
-        # find max of root, left child, right child
-        maxind = root
-        if child < end and feature_values[maxind] < feature_values[child]:
-            maxind = child
-        if child + 1 < end and feature_values[maxind] < feature_values[child + 1]:
-            maxind = child + 1
-
-        if maxind == root:
-            break
-        else:
-            swap(feature_values, samples, root, maxind)
-            root = maxind
-
-
-cdef void heapsort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil:
-    cdef SIZE_t start, end
-
-    # heapify
-    start = (n - 2) / 2
-    end = n
-    while True:
-        sift_down(feature_values, samples, start, end)
-        if start == 0:
-            break
-        start -= 1
-
-    # sort by shrinking the heap, putting the max element immediately after it
-    end = n - 1
-    while end > 0:
-        swap(feature_values, samples, 0, end)
-        sift_down(feature_values, samples, 0, end)
-        end = end - 1
-
-
 cdef inline int node_split_random(
     Splitter splitter,
     Partitioner partitioner,
@@ -742,7 +633,7 @@ cdef class DensePartitioner:
         # effectively.
         for i in range(self.start, self.end):
             feature_values[i] = X[samples[i], current_feature]
-        sort(&feature_values[self.start], &samples[self.start], self.end - self.start)
+        sort(&feature_values[self.start], &samples[self.start], self.end - self.start)

     cdef inline void find_min_max(
         self,
@@ -901,9 +792,9 @@ cdef class SparsePartitioner:
         self.extract_nnz(current_feature)
         # Sort the positive and negative parts of `feature_values`
-        sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
+        sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start)
         if self.start_positive < self.end:
-            sort(&feature_values[self.start_positive], &samples[self.start_positive],
+            sort(&feature_values[self.start_positive], &samples[self.start_positive],
                  self.end - self.start_positive)

         # Update index_to_samples to take into account the sort
diff --git a/sklearn/utils/_sorting.pxd b/sklearn/utils/_sorting.pxd
index 51f21afd4d3e4..92dacd5e0d82c 100644
--- a/sklearn/utils/_sorting.pxd
+++ b/sklearn/utils/_sorting.pxd
@@ -1,9 +1,21 @@
+from cython cimport floating
+
 from ._typedefs cimport intp_t

-from cython cimport floating
+cdef void simultaneous_quicksort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+) noexcept nogil
+
+cdef void simultaneous_introsort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+) noexcept nogil

-cdef int simultaneous_sort(
-    floating *dist,
-    intp_t *idx,
+cdef void simultaneous_heapsort(
+    floating* values,
+    intp_t* indices,
     intp_t size,
 ) noexcept nogil
diff --git a/sklearn/utils/_sorting.pyx b/sklearn/utils/_sorting.pyx
index 13b2d872392b9..0e04b2efa9ea5 100644
--- a/sklearn/utils/_sorting.pyx
+++ b/sklearn/utils/_sorting.pyx
@@ -1,22 +1,135 @@
 from cython cimport floating
+from libc.math cimport log2

-cdef inline void dual_swap(
-    floating* darr,
-    intp_t *iarr,
-    intp_t a,
-    intp_t b,
+from ._typedefs cimport intp_t
+
+# TODO: In order to support discrete distance metrics, we need to have a
+# simultaneous sort which breaks ties on indices when distances are identical.
+# The best might be using a std::stable_sort and a Comparator which might need
+# an Array of Structures (AoS) instead of the Structure of Arrays (SoA)
+# currently used. Alternatively, we can find a stable algorithm for SoA and
+# adapt it so that it is simultaneous.
+
+# Utility functions
+
+
+def _simultaneous_sort(
+    floating[::1] values,
+    intp_t[::1] indices,
+    kind=None,
+):
+    """Interface to simultaneous sorting algorithms.
+
+    `values` and `indices` are sorted simultaneously based on increasing
+    order of elements in `values`.
+
+    This interface exposes Cython implementations but is only meant to be
+    used for testing purposes.
+
+    Parameters
+    ----------
+    values : ndarray
+        1-D array of floating values to sort.
+
+    indices : ndarray
+        Associated 1-D array of values' indices to sort.
+
+    kind : str, default=None
+        Kind of the sorting algorithm to use.
+        Valid values for `kind` are in {'introsort', 'quicksort', 'heapsort'}.
+        If None, 'introsort' is used.
+ """ + cdef intp_t size = indices.shape[0] + + if kind is None: + kind = "introsort" + + if kind == "introsort": + return simultaneous_introsort(&values[0], &indices[0], size) + + if kind == "quicksort": + return simultaneous_quicksort(&values[0], &indices[0], size) + + if kind == "heapsort": + return simultaneous_heapsort(&values[0], &indices[0], size) + + raise ValueError(f"Currently kind='{kind}', but kind must be in ('introsort', 'quicksort', 'heapsort').") + +cdef inline void _simultaneous_swap( + floating* values, + intp_t* indices, + intp_t i, + intp_t j, +) noexcept nogil: + # Helper for sort + values[i], values[j] = values[j], values[i] + indices[i], indices[j] = indices[j], indices[i] + +cdef inline floating _median3( + floating* values, + intp_t size, +) noexcept nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef floating a = values[0], b = values[size / 2], c = values[size - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + +cdef inline void _sift_down( + floating* values, + intp_t* indices, + intp_t start, + intp_t end, ) noexcept nogil: - """Swap the values at index a and b of both darr and iarr""" - cdef floating dtmp = darr[a] - darr[a] = darr[b] - darr[b] = dtmp + # Restore heap order in values[start:end] by moving the max element to start. + cdef intp_t child, maxind, root - cdef intp_t itmp = iarr[a] - iarr[a] = iarr[b] - iarr[b] = itmp + root = start + while True: + child = root * 2 + 1 + # find max of root, left child, right child + maxind = root + if child < end and values[maxind] < values[child]: + maxind = child + if child + 1 < end and values[maxind] < values[child + 1]: + maxind = child + 1 -cdef int simultaneous_sort( + if maxind == root: + break + else: + _simultaneous_swap(values, indices, root, maxind) + root = maxind + + +# Sorting functions + +cdef inline void simultaneous_introsort( + floating* values, + intp_t* indices, + intp_t size, +) noexcept nogil: + # Sort a Structure of Arrays pointed consisting of arrays of values and indices, + # simultaneously, based on the values. Algorithm: Introsort (Musser, SP&E, 1997). + if size == 0: + return + cdef int maxd = 2 * log2(size) + _simultaneous_introsort(values, indices, size, maxd) + + +cdef void simultaneous_quicksort( floating* values, intp_t* indices, intp_t size, @@ -36,11 +149,6 @@ cdef int simultaneous_sort( Arrays are manipulated via a pointer to there first element and their size as to ease the processing of dynamically allocated buffers. """ - # TODO: In order to support discrete distance metrics, we need to have a - # simultaneous sort which breaks ties on indices when distances are identical. - # The best might be using a std::stable_sort and a Comparator which might need - # an Array of Structures (AoS) instead of the Structure of Arrays (SoA) - # currently used. 
     cdef:
         intp_t pivot_idx, i, store_idx
         floating pivot_val
@@ -50,14 +158,14 @@ cdef int simultaneous_sort(
         pass
     elif size == 2:
         if values[0] > values[1]:
-            dual_swap(values, indices, 0, 1)
+            _simultaneous_swap(values, indices, 0, 1)
     elif size == 3:
         if values[0] > values[1]:
-            dual_swap(values, indices, 0, 1)
+            _simultaneous_swap(values, indices, 0, 1)
         if values[1] > values[2]:
-            dual_swap(values, indices, 1, 2)
+            _simultaneous_swap(values, indices, 1, 2)
         if values[0] > values[1]:
-            dual_swap(values, indices, 0, 1)
+            _simultaneous_swap(values, indices, 0, 1)
     else:
         # Determine the pivot using the median-of-three rule.
         # The smallest of the three is moved to the beginning of the array,
@@ -65,11 +173,11 @@ cdef int simultaneous_sort(
         # is moved to the pivot index.
         pivot_idx = size // 2
         if values[0] > values[size - 1]:
-            dual_swap(values, indices, 0, size - 1)
+            _simultaneous_swap(values, indices, 0, size - 1)
         if values[size - 1] > values[pivot_idx]:
-            dual_swap(values, indices, size - 1, pivot_idx)
+            _simultaneous_swap(values, indices, size - 1, pivot_idx)
         if values[0] > values[size - 1]:
-            dual_swap(values, indices, 0, size - 1)
+            _simultaneous_swap(values, indices, 0, size - 1)
         pivot_val = values[size - 1]

         # Partition indices about pivot. At the end of this operation,
@@ -78,16 +186,77 @@ cdef int simultaneous_sort(
         store_idx = 0
         for i in range(size - 1):
             if values[i] < pivot_val:
-                dual_swap(values, indices, i, store_idx)
+                _simultaneous_swap(values, indices, i, store_idx)
                 store_idx += 1
-        dual_swap(values, indices, store_idx, size - 1)
+        _simultaneous_swap(values, indices, store_idx, size - 1)
         pivot_idx = store_idx

         # Recursively sort each side of the pivot
        if pivot_idx > 1:
-            simultaneous_sort(values, indices, pivot_idx)
+            simultaneous_quicksort(values, indices, pivot_idx)
         if pivot_idx + 2 < size:
-            simultaneous_sort(values + pivot_idx + 1,
-                              indices + pivot_idx + 1,
-                              size - pivot_idx - 1)
-    return 0
+            simultaneous_quicksort(values + pivot_idx + 1,
+                                   indices + pivot_idx + 1,
+                                   size - pivot_idx - 1)
+
+
+# Introsort with median of 3 pivot selection and 3-way partition function
+# (robust to repeated elements, e.g. lots of zero features).
+cdef void _simultaneous_introsort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+    int maxd,
+) noexcept nogil:
+    cdef floating pivot
+    cdef intp_t i, l, r
+
+    while size > 1:
+        if maxd <= 0:  # max depth limit exceeded ("gone quadratic")
+            simultaneous_heapsort(values, indices, size)
+            return
+        maxd -= 1
+
+        pivot = _median3(values, size)
+
+        # Three-way partition.
+        i = l = 0
+        r = size
+        while i < r:
+            if values[i] < pivot:
+                _simultaneous_swap(values, indices, i, l)
+                i += 1
+                l += 1
+            elif values[i] > pivot:
+                r -= 1
+                _simultaneous_swap(values, indices, i, r)
+            else:
+                i += 1
+
+        _simultaneous_introsort(values, indices, l, maxd)
+        values += r
+        indices += r
+        size -= r
+
+cdef void simultaneous_heapsort(
+    floating* values,
+    intp_t* indices,
+    intp_t size,
+) noexcept nogil:
+    cdef intp_t start, end
+
+    # heapify
+    start = (size - 2) / 2
+    end = size
+    while True:
+        _sift_down(values, indices, start, end)
+        if start == 0:
+            break
+        start -= 1
+
+    # sort by shrinking the heap, putting the max element immediately after it
+    end = size - 1
+    while end > 0:
+        _simultaneous_swap(values, indices, 0, end)
+        _sift_down(values, indices, 0, end)
+        end = end - 1
diff --git a/sklearn/utils/tests/test_sorting.py b/sklearn/utils/tests/test_sorting.py
new file mode 100644
index 0000000000000..e5488080bed9c
--- /dev/null
+++ b/sklearn/utils/tests/test_sorting.py
@@ -0,0 +1,37 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_array_almost_equal
+
+from sklearn.utils._sorting import _simultaneous_sort
+
+from sklearn.utils import check_random_state
+
+
+def test_simultaneous_sort_wrong_usage():
+    rng = check_random_state(0)
+    values = rng.random_sample(10).astype(np.float64, copy=False)
+    indices = np.arange(10).astype(np.intp, copy=False)
+
+    with pytest.raises(ValueError, match="Currently kind='nonexistent'"):
+        _simultaneous_sort(values, indices, kind="nonexistent")
+
+
+@pytest.mark.parametrize("kind", ["introsort", "heapsort", "quicksort"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_simultaneous_sort(kind, dtype, global_random_seed, n_pts=201):
+    # Sort sanity check
+    rng = check_random_state(global_random_seed)
+    values = rng.random_sample(n_pts).astype(dtype, copy=False)
+    indices = np.arange(n_pts).astype(np.intp, copy=False)
+
+    values_2 = values.copy()
+    indices_2 = indices.copy()
+
+    _simultaneous_sort(values, indices, kind=kind)
+
+    sorted_indices = np.argsort(values_2)
+    values_2 = values_2[sorted_indices]
+    indices_2 = indices_2[sorted_indices]
+
+    assert_array_almost_equal(values, values_2)
+    assert_array_almost_equal(indices, indices_2)
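Not part of the patch above: for readers who want to follow the dual-array control flow without the Cython pointer syntax, here is a minimal pure-Python sketch of the median-of-three quicksort that `simultaneous_quicksort` implements, sorting a values array while permuting an indices array in lockstep. The function name `simultaneous_quicksort_py` and the example data are made up for illustration only; the shipped implementation is the `nogil` Cython routine in `sklearn/utils/_sorting.pyx`, which also short-circuits sizes 2 and 3 and guards its recursion differently.

```python
def simultaneous_quicksort_py(values, indices, lo=0, hi=None):
    """Sort values[lo:hi] in ascending order, permuting indices in lockstep."""
    if hi is None:
        hi = len(values)

    def swap(a, b):
        # Swap both arrays at the same positions so they stay aligned.
        values[a], values[b] = values[b], values[a]
        indices[a], indices[b] = indices[b], indices[a]

    size = hi - lo
    if size <= 1:
        return

    # Median-of-three pivot selection: after these comparisons the smallest of
    # the three probed elements sits at lo, the largest at the midpoint, and
    # the median (used as the pivot) at hi - 1, mirroring the Cython routine.
    mid = lo + size // 2
    if values[lo] > values[hi - 1]:
        swap(lo, hi - 1)
    if values[hi - 1] > values[mid]:
        swap(hi - 1, mid)
    if values[lo] > values[hi - 1]:
        swap(lo, hi - 1)
    pivot = values[hi - 1]

    # Partition: move everything strictly smaller than the pivot to the front,
    # then place the pivot right after that block.
    store = lo
    for i in range(lo, hi - 1):
        if values[i] < pivot:
            swap(i, store)
            store += 1
    swap(store, hi - 1)

    # Recurse on both sides of the pivot.
    simultaneous_quicksort_py(values, indices, lo, store)
    simultaneous_quicksort_py(values, indices, store + 1, hi)


values = [0.3, 0.1, 0.2]
indices = [0, 1, 2]
simultaneous_quicksort_py(values, indices)
assert values == [0.1, 0.2, 0.3]
assert indices == [1, 2, 0]
```

The same lockstep-swap idea underlies `simultaneous_introsort` in the patch: it runs this kind of partitioning until a depth budget of `2 * log2(size)` is exhausted and then falls back to `simultaneous_heapsort`, which keeps the worst case at O(n log n) even on adversarial or highly repetitive inputs.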