Thanks to visit codestin.com
Credit goes to github.com

Skip to content

FEA PairwiseDistancesReductions: support for Boolean DistanceMetrics via stable simultaneous sort #25097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from cython cimport final
from cython.parallel cimport parallel, prange

from ...utils._heap cimport heap_push
from ...utils._sorting cimport simultaneous_sort
from ...utils._sorting cimport sort
from ...utils._typedefs cimport ITYPE_t, DTYPE_t

import numpy as np
Expand Down Expand Up @@ -194,7 +194,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
# Sorting the portion of the main heaps associated with `X[X_start:X_end]`
# in ascending order w.r.t. the distances.
for idx in range(X_end - X_start):
simultaneous_sort(
sort(
self.heaps_r_distances_chunks[thread_num] + idx * self.k,
self.heaps_indices_chunks[thread_num] + idx * self.k,
self.k
Expand Down Expand Up @@ -278,7 +278,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
# Sorting the main results in ascending order w.r.t. the distances.
# This is done in parallel sample-wise (no need for locks).
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
sort(
&self.argkmin_distances[idx, 0],
&self.argkmin_indices[idx, 0],
self.k,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from cython cimport final
from cython.operator cimport dereference as deref
from cython.parallel cimport parallel, prange

from ...utils._sorting cimport simultaneous_sort
from ...utils._sorting cimport sort
from ...utils._typedefs cimport ITYPE_t, DTYPE_t
from ...utils._vector_sentinel cimport vector_to_nd_array

Expand Down Expand Up @@ -221,7 +221,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
# Sorting neighbors for each query vector of X
if self.sort_results:
for idx in range(X_start, X_end):
simultaneous_sort(
sort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
Expand Down Expand Up @@ -292,7 +292,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
# Sort in parallel in ascending order w.r.t the distances if requested.
if self.sort_results:
for idx in prange(self.n_samples_X, schedule='static'):
simultaneous_sort(
sort(
deref(self.neigh_distances)[idx].data(),
deref(self.neigh_indices)[idx].data(),
deref(self.neigh_indices)[idx].size()
Expand Down
116 changes: 2 additions & 114 deletions sklearn/tree/_splitter.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import numpy as np

from scipy.sparse import csc_matrix

from ..utils._sorting cimport sort

from ._utils cimport log
from ._utils cimport rand_int
from ._utils cimport rand_uniform
Expand Down Expand Up @@ -437,120 +439,6 @@ cdef class BestSplitter(BaseDenseSplitter):
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # Guard: empty range; also avoids evaluating log(0) below.
    if n == 0:
        return
    # Introsort depth budget: 2 * floor(log2(n)) quicksort levels before
    # falling back to heapsort, which bounds the worst case at O(n log n).
    cdef int maxd = 2 * <int>log(n)
    introsort(Xf, samples, n, maxd)


cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples,
                      SIZE_t i, SIZE_t j) nogil:
    # Helper for sort: exchange entries i and j in both arrays so that
    # feature values stay aligned with their sample indices.
    Xf[i], Xf[j] = Xf[j], Xf[i]
    samples[i], samples[j] = samples[j], samples[i]


cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    # Samples the first, middle and last elements of Xf[0:n].
    cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        if b < c:
            return b  # a < b < c
        elif a < c:
            return c  # a < c <= b
        else:
            return a  # c <= a < b
    elif b < c:
        if a < c:
            return a  # b <= a < c
        else:
            return c  # b < c <= a
    else:
        return b  # c <= b <= a


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* Xf, SIZE_t *samples,
                    SIZE_t n, int maxd) nogil:
    # Sorts Xf[0:n] in ascending order, applying the same permutation to
    # samples. `maxd` is the remaining recursion-depth budget.
    cdef DTYPE_t pivot
    cdef SIZE_t i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition: on exit, Xf[0:l] < pivot,
        # Xf[l:r] == pivot, and Xf[r:n] > pivot.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        # Recurse on the "< pivot" prefix; continue the while-loop on the
        # "> pivot" suffix (tail-call elimination). The middle run, equal
        # to the pivot, is already in its final position.
        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r


cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples,
                           SIZE_t start, SIZE_t end) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    # Children of node i live at 2*i + 1 and 2*i + 2 (implicit binary heap).
    cdef SIZE_t child, maxind, root

    root = start
    while True:
        child = root * 2 + 1

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            # Heap property restored; stop sifting.
            break
        else:
            swap(Xf, samples, root, maxind)
            root = maxind


cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # In-place max-heap sort of Xf[0:n], permuting samples identically.
    # Used as introsort's O(n log n) fallback once the depth budget runs out.
    cdef SIZE_t start, end

    # heapify: sift down every internal node, from the last one up to the root
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1


cdef class RandomSplitter(BaseDenseSplitter):
"""Splitter for finding the best random split."""
def __reduce__(self):
Expand Down
5 changes: 4 additions & 1 deletion sklearn/utils/_sorting.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ._typedefs cimport DTYPE_t, ITYPE_t
cimport numpy as cnp
from ._typedefs cimport ITYPE_t

from cython cimport floating

Expand All @@ -7,3 +8,5 @@ cdef int simultaneous_sort(
ITYPE_t *idx,
ITYPE_t size,
) nogil

cdef void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil
129 changes: 129 additions & 0 deletions sklearn/utils/_sorting.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
cimport numpy as cnp

from cython cimport floating
from libc.math cimport log as ln

# TODO: Factor code also present in `tree._utils` or use `libc.math.log2` directly
cdef inline double log(double x) nogil:
    # Base-2 logarithm computed via the natural log: log2(x) = ln(x) / ln(2).
    return ln(x) / ln(2.0)


cdef inline void dual_swap(
floating* darr,
Expand Down Expand Up @@ -91,3 +99,124 @@ cdef int simultaneous_sort(
indices + pivot_idx + 1,
size - pivot_idx - 1)
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil:
    # Guard: empty range; also avoids evaluating log(0) below.
    if n == 0:
        return
    # Introsort depth budget: 2 * floor(log2(n)) quicksort levels before
    # falling back to heapsort, which bounds the worst case at O(n log n).
    cdef int maxd = 2 * <int>log(n)
    introsort(Xf, samples, n, maxd)


cdef inline void swap(
    floating* Xf,
    cnp.npy_intp* samples,
    cnp.npy_intp i,
    cnp.npy_intp j,
) nogil:
    # Helper for sort: exchange entries i and j in both arrays, keeping
    # feature values aligned with their sample indices.
    cdef floating value_tmp = Xf[i]
    cdef cnp.npy_intp index_tmp = samples[i]
    Xf[i] = Xf[j]
    samples[i] = samples[j]
    Xf[j] = value_tmp
    samples[j] = index_tmp


cdef inline floating median3(floating* Xf, cnp.npy_intp n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    # Samples the first, middle and last elements of Xf[0:n].
    cdef floating a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        if b < c:
            return b  # a < b < c
        elif a < c:
            return c  # a < c <= b
        else:
            return a  # c <= a < b
    elif b < c:
        if a < c:
            return a  # b <= a < c
        else:
            return c  # b < c <= a
    else:
        return b  # c <= b <= a


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(floating* Xf, cnp.npy_intp *samples, cnp.npy_intp n, int maxd) nogil:
    # Sorts Xf[0:n] in ascending order, applying the same permutation to
    # samples. `maxd` is the remaining recursion-depth budget.
    cdef floating pivot
    cdef cnp.npy_intp i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition: on exit, Xf[0:l] < pivot,
        # Xf[l:r] == pivot, and Xf[r:n] > pivot.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        # Recurse on the "< pivot" prefix; continue the while-loop on the
        # "> pivot" suffix (tail-call elimination). The middle run, equal
        # to the pivot, is already in its final position.
        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r


cdef inline void sift_down(
    floating* Xf,
    cnp.npy_intp* samples,
    cnp.npy_intp start,
    cnp.npy_intp end,
) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    # Children of node i live at 2*i + 1 and 2*i + 2 (implicit binary heap).
    cdef cnp.npy_intp child, maxind, root

    root = start
    while True:
        child = root * 2 + 1

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            # Heap property restored; stop sifting.
            break
        else:
            swap(Xf, samples, root, maxind)
            root = maxind


cdef void heapsort(floating* Xf, cnp.npy_intp* samples, cnp.npy_intp n) nogil:
    # In-place max-heap sort of Xf[0:n], permuting samples identically.
    # Used as introsort's O(n log n) fallback once the depth budget runs out.
    cdef cnp.npy_intp start, end

    # heapify: sift down every internal node, from the last one up to the root
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1