diff --git a/.gitignore b/.gitignore index f4601a15655a5..76f8a60158209 100644 --- a/.gitignore +++ b/.gitignore @@ -99,6 +99,9 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +sklearn/neighbors/_ball_tree.pyx +sklearn/neighbors/_binary_tree.pxi +sklearn/neighbors/_kd_tree.pyx # Default JupyterLite content jupyterlite_contents diff --git a/setup.cfg b/setup.cfg index 94ed59f539cb7..b7705781dbb7d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,6 +53,9 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx + sklearn/neighbors/_ball_tree.pyx + sklearn/neighbors/_binary_tree.pxi + sklearn/neighbors/_kd_tree.pyx [codespell] diff --git a/setup.py b/setup.py index 5af738f5f841f..f9ae13c94502b 100755 --- a/setup.py +++ b/setup.py @@ -306,8 +306,9 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_ball_tree.pyx"], "include_np": True}, - {"sources": ["_kd_tree.pyx"], "include_np": True}, + {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, + {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, {"sources": ["_quad_tree.pyx"], "include_np": True}, ], @@ -499,13 +500,18 @@ def configure_extension_modules(): # `source` is a Tempita file tempita_sources.append(source) - # Do not include pxd files that were generated by tempita - if os.path.splitext(new_source_path)[-1] == ".pxd": - continue - sources.append(new_source_path) + # Only include source files that are pyx files + if os.path.splitext(new_source_path)[-1] == ".pyx": + sources.append(new_source_path) gen_from_templates(tempita_sources) + # Do not progress if we only have a tempita file which we don't + # want to include like the .pxi.tp extension. In such a case + # sources would be empty. + if not sources: + continue + # By convention, our extensions always use the name of the first source source_name = os.path.splitext(os.path.basename(sources[0]))[0] if submodule: diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx deleted file mode 100644 index d9b933cb43c66..0000000000000 --- a/sklearn/neighbors/_ball_tree.pyx +++ /dev/null @@ -1,195 +0,0 @@ -# Author: Jake Vanderplas -# License: BSD 3 clause - -__all__ = ['BallTree'] - -DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} - -VALID_METRICS = [ - 'BrayCurtisDistance64', - 'CanberraDistance64', - 'ChebyshevDistance64', - 'DiceDistance64', - 'EuclideanDistance64', - 'HammingDistance64', - 'HaversineDistance64', - 'JaccardDistance64', - 'MahalanobisDistance64', - 'ManhattanDistance64', - 'MinkowskiDistance64', - 'PyFuncDistance64', - 'RogersTanimotoDistance64', - 'RussellRaoDistance64', - 'SEuclideanDistance64', - 'SokalMichenerDistance64', - 'SokalSneathDistance64', - 'WMinkowskiDistance64', -] - -include "_binary_tree.pxi" - -# Inherit BallTree from BinaryTree -cdef class BallTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) - pass - - -# ---------------------------------------------------------------------- -# The functions below specialized the Binary Tree as a Ball Tree -# -# Note that these functions use the concept of "reduced distance". -# The reduced distance, defined for some metrics, is a quantity which -# is more efficient to compute than the distance, but preserves the -# relative rankings of the true distance. For example, the reduced -# distance for the Euclidean metric is the squared-euclidean distance. -# For some metrics, the reduced distance is simply the distance. - -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: - """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype=np.float64) - return 0 - - -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: - """Initialize the node for the dataset stored in tree.data""" - cdef intp_t n_features = tree.data.shape[1] - cdef intp_t n_points = idx_end - idx_start - - cdef intp_t i, j - cdef float64_t radius - cdef float64_t *this_pt - - cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data = &tree.data[0, 0] - cdef float64_t* centroid = &tree.node_bounds[0, i_node, 0] - - cdef bint with_sample_weight = tree.sample_weight is not None - cdef float64_t* sample_weight - cdef float64_t sum_weight_node - if with_sample_weight: - sample_weight = &tree.sample_weight[0] - - # determine Node centroid - for j in range(n_features): - centroid[j] = 0 - - if with_sample_weight: - sum_weight_node = 0 - for i in range(idx_start, idx_end): - sum_weight_node += sample_weight[idx_array[i]] - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] * sample_weight[idx_array[i]] - - for j in range(n_features): - centroid[j] /= sum_weight_node - else: - for i in range(idx_start, idx_end): - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] - - for j in range(n_features): - centroid[j] /= n_points - - # determine Node radius - radius = 0 - for i in range(idx_start, idx_end): - radius = fmax(radius, - tree.rdist(centroid, - data + n_features * idx_array[i], - n_features)) - - node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) - node_data[i_node].idx_start = idx_start - node_data[i_node].idx_end = idx_end - return 0 - - -cdef inline float64_t min_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return fmax(0, dist_pt - tree.node_data[i_node].radius) - - -cdef inline float64_t max_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return dist_pt + tree.node_data[i_node].radius - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: - """Compute the minimum and maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - cdef float64_t rad = tree.node_data[i_node].radius - min_dist[0] = fmax(0, dist_pt - rad) - max_dist[0] = dist_pt + rad - return 0 - - -cdef inline float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(min_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(min_dist(tree, i_node, pt)) - - -cdef inline float64_t max_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(max_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(max_dist(tree, i_node, pt)) - - -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return fmax(0, (dist_pt - tree1.node_data[i_node1].radius - - tree2.node_data[i_node2].radius)) - - -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return (dist_pt + tree1.node_data[i_node1].radius - + tree2.node_data[i_node2].radius) - - -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - - -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(max_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000..92b26714e5d9f --- /dev/null +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,284 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# Author: Jake Vanderplas +# License: BSD 3 clause + +}} + + +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi.tp similarity index 90% rename from sklearn/neighbors/_binary_tree.pxi rename to sklearn/neighbors/_binary_tree.pxi.tp index b60ea3a0a6d70..6322f809f7eb9 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1,14 +1,32 @@ -#!python +{{py: +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] # KD Tree and Ball Tree # ===================== # # Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# # License: BSD # -# This file is meant to be a literal include in a pyx file. -# See ball_tree.pyx and kd_tree.pyx +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +}} + + +# KD Tree and Ball Tree +# ===================== # # The routines here are the core algorithms of the KDTree and BallTree # structures. If Cython supported polymorphism, we would be able to @@ -143,6 +161,7 @@ # """Compute the maximum distance between two nodes""" cimport numpy as cnp +from cython cimport floating from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax from libc.stdlib cimport calloc, malloc, free @@ -154,15 +173,19 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, DistanceMetric64, + DistanceMetric32, euclidean_dist64, + euclidean_dist32, euclidean_rdist64, + euclidean_rdist32, euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, ) from ._partition_nodes cimport partition_node_indices from ..utils import check_array -from ..utils._typedefs cimport float64_t, intp_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t from ..utils._heap cimport heap_push from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort @@ -500,8 +523,9 @@ def kernel_norm(h, d, kernel, return_log=False): else: return np.exp(result) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -cdef class NeighborsHeap: +cdef class NeighborsHeap{{name_suffix}}: """A max-heap structure to keep track of distances/indices of neighbors This implements an efficient pre-allocated set of fixed-size heaps @@ -516,19 +540,19 @@ cdef class NeighborsHeap: n_nbrs : int the size of each heap. """ - cdef float64_t[:, ::1] distances + cdef {{INPUT_DTYPE_t}}[:, ::1] distances cdef intp_t[:, ::1] indices def __cinit__(self): # One-element arrays are used as placeholders to prevent # any problem due to potential access to those attributes # (e.g. assigning to NULL or a to value in another segment). - self.distances = np.zeros((1, 1), dtype=np.float64, order='C') + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') self.indices = np.zeros((1, 1), dtype=np.intp, order='C') def __init__(self, n_pts, n_nbrs): self.distances = np.full( - (n_pts, n_nbrs), np.inf, dtype=np.float64, order='C' + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' ) self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') @@ -571,14 +595,16 @@ cdef class NeighborsHeap: ) return 0 -# ------------------------------------------------------------ +{{endfor}} + +#------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of # j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) -cdef intp_t find_node_split_dim(float64_t* data, - intp_t* node_indices, - intp_t n_features, - intp_t n_points) except -1: +cdef intp_t find_node_split_dim(const floating* data, + intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: """Find the dimension with the largest spread. Parameters @@ -764,29 +790,31 @@ def newObj(obj): return obj.__new__(obj) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + ###################################################################### -# define the reverse mapping of VALID_METRICS +# define the reverse mapping of VALID_METRICS{{name_suffix}} from sklearn.metrics._dist_metrics import get_valid_metric_ids -VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) ###################################################################### # Binary Tree class -cdef class BinaryTree: +cdef class BinaryTree{{name_suffix}}: - cdef readonly const float64_t[:, ::1] data - cdef readonly const float64_t[::1] sample_weight + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight cdef public float64_t sum_weight cdef public const intp_t[::1] idx_array cdef public const NodeData_t[::1] node_data - cdef public const float64_t[:, :, ::1] node_bounds + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds cdef intp_t leaf_size cdef intp_t n_levels cdef intp_t n_nodes - cdef DistanceMetric64 dist_metric + cdef DistanceMetric{{name_suffix}} dist_metric cdef int euclidean # variables to keep track of building & querying stats @@ -795,7 +823,7 @@ cdef class BinaryTree: cdef int n_splits cdef int n_calls - valid_metrics = VALID_METRIC_IDS + valid_metrics = VALID_METRIC_IDS{{name_suffix}} # Use cinit to initialize all arrays to empty: this will prevent memory # errors and seg-faults in rare cases where __init__ is not called @@ -803,11 +831,11 @@ cdef class BinaryTree: # any problem due to potential access to this attribute # (e.g. assigning to NULL or a to value in another segment). def __cinit__(self): - self.data = np.empty((1, 1), dtype=np.float64, order='C') - self.sample_weight = np.empty(1, dtype=np.float64, order='C') + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') self.idx_array = np.empty(1, dtype=np.intp, order='C') self.node_data = np.empty(1, dtype=NodeData, order='C') - self.node_bounds = np.empty((1, 1, 1), dtype=np.float64) + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) self.leaf_size = 0 self.n_levels = 0 @@ -823,7 +851,7 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): # validate data - self.data = check_array(data, dtype=np.float64, order='C') + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') if self.data.size == 0: raise ValueError("X is an empty array") @@ -834,15 +862,15 @@ cdef class BinaryTree: raise ValueError("leaf_size must be greater than or equal to 1") self.leaf_size = leaf_size - self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance64') + == 'EuclideanDistance{{name_suffix}}') metric = self.dist_metric.__class__.__name__ - if metric not in VALID_METRICS: + if metric not in VALID_METRICS{{name_suffix}}: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, - **DOC_DICT)) + **DOC_DICT{{name_suffix}})) self.dist_metric._validate_data(self.data) # determine number of levels in the tree, and from this @@ -859,7 +887,7 @@ cdef class BinaryTree: self._update_sample_weight(n_samples, sample_weight) # Allocate tree-specific data - allocate_data(self, self.n_nodes, n_features) + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) self._recursive_build( node_data=self.node_data.base, i_node=0, @@ -870,7 +898,7 @@ cdef class BinaryTree: def _update_sample_weight(self, n_samples, sample_weight): if sample_weight is not None: self.sample_weight = np.asarray( - sample_weight, dtype=np.float64, order='C') + sample_weight, dtype={{INPUT_DTYPE}}, order='C') self.sum_weight = np.sum(self.sample_weight) else: self.sample_weight = None @@ -982,17 +1010,17 @@ cdef class BinaryTree: self.node_bounds.base, ) - cdef inline float64_t dist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t dist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: - return euclidean_dist64(x1, x2, size) + return euclidean_dist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.dist(x1, x2, size) - cdef inline float64_t rdist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t rdist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the reduced distance between arrays x1 and x2. The reduced distance, defined for some metrics, is a quantity which @@ -1002,7 +1030,7 @@ cdef class BinaryTree: """ self.n_calls += 1 if self.euclidean: - return euclidean_rdist64(x1, x2, size) + return euclidean_rdist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.rdist(x1, x2, size) @@ -1023,10 +1051,10 @@ cdef class BinaryTree: cdef intp_t n_points = idx_end - idx_start cdef intp_t n_mid = n_points / 2 cdef intp_t* idx_array = &self.idx_array[idx_start] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # initialize node data - init_node(self, node_data, i_node, idx_start, idx_end) + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) if 2 * i_node + 1 >= self.n_nodes: node_data[i_node].is_leaf = True @@ -1103,7 +1131,7 @@ cdef class BinaryTree: corresponding point. """ # XXX: we should allow X to be a pre-built tree. - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " @@ -1115,13 +1143,13 @@ cdef class BinaryTree: # flatten X, and save original shape information np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef const float64_t[:, ::1] Xarr = np_Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr cdef float64_t reduced_dist_LB cdef intp_t i - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt # initialize heap for neighbors - cdef NeighborsHeap heap = NeighborsHeap(Xarr.shape[0], k) + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) # node heap for breadth-first queries cdef NodeHeap nodeheap @@ -1141,7 +1169,7 @@ cdef class BinaryTree: if breadth_first: self._query_dual_breadthfirst(other, heap, nodeheap) else: - reduced_dist_LB = min_rdist_dual(self, 0, other, 0) + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) bounds = np.full(other.node_data.shape[0], np.inf) self._query_dual_depthfirst(0, other, 0, bounds, heap, reduced_dist_LB) @@ -1155,7 +1183,7 @@ cdef class BinaryTree: else: with nogil: for i in range(Xarr.shape[0]): - reduced_dist_LB = min_rdist(self, 0, pt) + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) self._query_single_depthfirst(0, pt, i, heap, reduced_dist_LB) pt += Xarr.shape[1] @@ -1233,20 +1261,20 @@ cdef class BinaryTree: cdef intp_t i, count_i = 0 cdef intp_t n_features = self.data.shape[1] - cdef float64_t[::1] dist_arr_i + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i cdef intp_t[::1] idx_arr_i, counts - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt cdef intp_t** indices = NULL - cdef float64_t** distances = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") - cdef const float64_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1265,7 +1293,7 @@ cdef class BinaryTree: if indices == NULL: raise MemoryError() if return_distance: - distances = calloc(Xarr.shape[0], sizeof(float64_t*)) + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) if distances == NULL: free(indices) raise MemoryError() @@ -1273,7 +1301,7 @@ cdef class BinaryTree: np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) idx_arr_i = np_idx_arr - np_dist_arr = np.zeros(self.data.shape[0], dtype=np.float64) + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) dist_arr_i = np_dist_arr counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) @@ -1306,11 +1334,11 @@ cdef class BinaryTree: if return_distance: # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() - distances[i] = malloc(counts[i] * sizeof(float64_t)) + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) if distances[i] == NULL: memory_error = True break - memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof(float64_t)) + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) try: if memory_error: @@ -1333,7 +1361,7 @@ cdef class BinaryTree: # make a new numpy array that wraps the existing data # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 - distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_DOUBLE, distances[i]) + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) # make sure the data will be freed when the numpy array is garbage collected PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) # make sure the data is not freed twice @@ -1445,18 +1473,18 @@ cdef class BinaryTree: cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != n_features: raise ValueError("query data dimension must " "match training data dimension") Xarr_np = X.reshape((-1, n_features)) - cdef float64_t[:, ::1] Xarr = Xarr_np + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np - log_density_arr = np.zeros(Xarr.shape[0], dtype=np.float64) - cdef float64_t[::1] log_density = log_density_arr + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] cdef NodeHeap nodeheap if breadth_first: @@ -1481,7 +1509,7 @@ cdef class BinaryTree: pt += n_features else: for i in range(Xarr.shape[0]): - min_max_dist(self, 0, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) # compute max & min bounds on density within top node log_min_bound = (log(self.sum_weight) + compute_log_kernel(dist_UB, @@ -1539,14 +1567,14 @@ cdef class BinaryTree: cdef intp_t i # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef float64_t[:, ::1] Xarr = np_Xarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1561,7 +1589,7 @@ cdef class BinaryTree: count = np.zeros(r.shape[0], dtype=np.intp) cdef intp_t[::1] carr = count - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] if dualtree: other = self.__class__(Xarr, metric=self.dist_metric, @@ -1576,17 +1604,21 @@ cdef class BinaryTree: return count - cdef int _query_single_depthfirst(self, intp_t i_node, - float64_t* pt, intp_t i_pt, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1 nogil: + cdef int _query_single_depthfirst( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" cdef NodeData_t node_info = self.node_data[i_node] cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 cdef intp_t i, i1, i2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # ------------------------------------------------------------ # Case 1: query point is outside node radius: @@ -1611,8 +1643,8 @@ cdef class BinaryTree: self.n_splits += 1 i1 = 2 * i_node + 1 i2 = i1 + 1 - reduced_dist_LB_1 = min_rdist(self, i1, pt) - reduced_dist_LB_2 = min_rdist(self, i2, pt) + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) # recursively query subnodes if reduced_dist_LB_1 <= reduced_dist_LB_2: @@ -1627,19 +1659,22 @@ cdef class BinaryTree: reduced_dist_LB_1) return 0 - cdef int _query_single_breadthfirst(self, float64_t* pt, - intp_t i_pt, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_single_breadthfirst( + self, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive single-tree k-neighbors query, breadth-first search""" cdef intp_t i, i_node cdef float64_t dist_pt, reduced_dist_LB cdef NodeData_t* node_data = &self.node_data[0] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # Set up the node heap and push the head node onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist(self, 0, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) @@ -1672,15 +1707,19 @@ cdef class BinaryTree: self.n_splits += 1 for i in range(2 * i_node + 1, 2 * i_node + 3): nodeheap_item.i1 = i - nodeheap_item.val = min_rdist(self, i, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) nodeheap.push(nodeheap_item) return 0 - cdef int _query_dual_depthfirst(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t[::1] bounds, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1: + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: """Recursive dual-tree k-neighbors query, depth-first""" # note that the array `bounds` is maintained such that # bounds[i] is the largest distance among any of the @@ -1688,8 +1727,8 @@ cdef class BinaryTree: cdef NodeData_t node_info1 = self.node_data[i_node1] cdef NodeData_t node_info2 = other.node_data[i_node2] - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 @@ -1740,9 +1779,9 @@ cdef class BinaryTree: # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf and node_info2.radius > node_info1.radius): - reduced_dist_LB1 = min_rdist_dual(self, i_node1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 1) - reduced_dist_LB2 = min_rdist_dual(self, i_node1, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1760,9 +1799,9 @@ cdef class BinaryTree: # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: - reduced_dist_LB1 = min_rdist_dual(self, 2 * i_node1 + 1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, other, i_node2) - reduced_dist_LB2 = min_rdist_dual(self, 2 * i_node1 + 2, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, other, i_node2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1777,9 +1816,12 @@ cdef class BinaryTree: bounds, heap, reduced_dist_LB1) return 0 - cdef int _query_dual_breadthfirst(self, BinaryTree other, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" cdef intp_t i, i1, i2, i_node1, i_node2, i_pt cdef float64_t dist_pt, reduced_dist_LB @@ -1787,13 +1829,13 @@ cdef class BinaryTree: cdef NodeData_t* node_data1 = &self.node_data[0] cdef NodeData_t* node_data2 = &other.node_data[0] cdef NodeData_t node_info1, node_info2 - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] # Set up the node heap and push the head nodes onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist_dual(self, 0, other, 0) + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) nodeheap_item.i1 = 0 nodeheap_item.i2 = 0 nodeheap.push(nodeheap_item) @@ -1845,7 +1887,7 @@ cdef class BinaryTree: nodeheap_item.i1 = i_node1 for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): nodeheap_item.i2 = i2 - nodeheap_item.val = min_rdist_dual(self, i_node1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, other, i2) nodeheap.push(nodeheap_item) @@ -1856,21 +1898,24 @@ cdef class BinaryTree: nodeheap_item.i2 = i_node2 for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): nodeheap_item.i1 = i1 - nodeheap_item.val = min_rdist_dual(self, i1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, other, i_node2) nodeheap.push(nodeheap_item) return 0 - cdef intp_t _query_radius_single(self, - intp_t i_node, - float64_t* pt, float64_t r, - intp_t* indices, - float64_t* distances, - intp_t count, - int count_only, - int return_distance) noexcept nogil: + cdef intp_t _query_radius_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: """recursive single-tree radius query, depth-first""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -1879,7 +1924,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Case 1: all node points are outside distance r. @@ -1937,13 +1982,17 @@ cdef class BinaryTree: return count - cdef float64_t _kde_single_breadthfirst(self, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - NodeHeap nodeheap, - float64_t* node_log_min_bounds, - float64_t* node_log_bound_spreads): + cdef float64_t _kde_single_breadthfirst( + self, {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): """non-recursive single-tree kernel density estimation""" # For the given point, node_log_min_bounds and node_log_bound_spreads # will encode the current bounds on the density between the point @@ -1957,9 +2006,9 @@ cdef class BinaryTree: cdef float64_t global_log_min_bound, global_log_bound_spread cdef float64_t global_log_max_bound - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight if with_sample_weight: sample_weight = &self.sample_weight[0] cdef intp_t* idx_array = &self.idx_array[0] @@ -1981,13 +2030,13 @@ cdef class BinaryTree: # push the top node to the heap cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_dist(self, 0, pt) + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) - global_log_min_bound = log(N) + compute_log_kernel(max_dist(self, - 0, pt), - h, kernel) + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, h, kernel) global_log_bound_spread = logsubexp(global_log_max_bound, @@ -2056,8 +2105,8 @@ cdef class BinaryTree: N1 = node_data[i1].idx_end - node_data[i1].idx_start N2 = node_data[i2].idx_end - node_data[i2].idx_start - min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1) - min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) node_log_min_bounds[i1] = (log(N1) + compute_log_kernel(dist_UB_1, @@ -2102,14 +2151,19 @@ cdef class BinaryTree: global_log_bound_spread - log(2)) cdef int _kde_single_depthfirst( - self, intp_t i_node, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - float64_t local_log_min_bound, - float64_t local_log_bound_spread, - float64_t* global_log_min_bound, - float64_t* global_log_bound_spread) except -1: + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: """recursive single-tree kernel density estimate, depth-first""" # For the given point, local_min_bound and local_max_bound give the # minimum and maximum density for the current node, while @@ -2119,10 +2173,10 @@ cdef class BinaryTree: cdef intp_t i, i1, i2, iw, start, end cdef float64_t N1, N2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef NodeData_t* node_data = &self.node_data[0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight cdef float64_t log_weight if with_sample_weight: sample_weight = &self.sample_weight[0] @@ -2194,7 +2248,7 @@ cdef class BinaryTree: N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) - min_max_dist(self, i1, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, kernel) child1_log_bound_spread = logsubexp(log(N1) + @@ -2202,7 +2256,7 @@ cdef class BinaryTree: kernel), child1_log_min_bound) - min_max_dist(self, i2, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, kernel) child2_log_bound_spread = logsubexp(log(N2) + @@ -2238,11 +2292,17 @@ cdef class BinaryTree: global_log_bound_spread) return 0 - cdef int _two_point_single(self, intp_t i_node, float64_t* pt, float64_t* r, - intp_t* count, intp_t i_min, - intp_t i_max) except -1: + cdef int _two_point_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive single-tree two-point correlation function query""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -2251,7 +2311,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2287,13 +2347,19 @@ cdef class BinaryTree: count, i_min, i_max) return 0 - cdef int _two_point_dual(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t* r, intp_t* count, - intp_t i_min, intp_t i_max) except -1: + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive dual-tree two-point correlation function query""" - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t* idx_array1 = &self.idx_array[0] cdef intp_t* idx_array2 = &other.idx_array[0] cdef NodeData_t node_info1 = self.node_data[i_node1] @@ -2305,8 +2371,8 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - dist_LB = min_dist_dual(self, i_node1, other, i_node2) - dist_UB = max_dist_dual(self, i_node1, other, i_node2) + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2359,21 +2425,11 @@ cdef class BinaryTree: r, count, i_min, i_max) return 0 +{{endfor}} ###################################################################### # Python functions for benchmarking and testing C implementations -def load_heap(float64_t[:, ::1] X, intp_t k): - """test fully loading the heap""" - assert k <= X.shape[1] - cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) - cdef intp_t i, j - for i in range(X.shape[0]): - for j in range(X.shape[1]): - heap._push(i, X[i, j], j) - return heap.get_arrays() - - def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays @@ -2412,10 +2468,12 @@ def nodeheap_sort(float64_t[::1] vals): return np.asarray(vals_sorted), np.asarray(indices) -cdef inline float64_t _total_node_weight(NodeData_t* node_data, - float64_t* sample_weight, - intp_t* idx_array, - intp_t i_node): +cdef inline float64_t _total_node_weight( + NodeData_t* node_data, + const floating* sample_weight, + intp_t* idx_array, + intp_t i_node, +): cdef intp_t i cdef float64_t N = 0.0 for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx.tp similarity index 65% rename from sklearn/neighbors/_kd_tree.pyx rename to sklearn/neighbors/_kd_tree.pyx.tp index f5cd2617be147..1006ec2a8398c 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -1,22 +1,52 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD -__all__ = ['KDTree'] +}} + -DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] -VALID_METRICS = ['EuclideanDistance64', 'ManhattanDistance64', - 'ChebyshevDistance64', 'MinkowskiDistance64'] +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} include "_binary_tree.pxi" -# Inherit KDTree from BinaryTree -cdef class KDTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) pass +{{endfor}} + # ---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a KD Tree @@ -28,27 +58,36 @@ cdef class KDTree(BinaryTree): # distance for the Euclidean metric is the squared-euclidean distance. # For some metrics, the reduced distance is simply the distance. +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype=np.float64) + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) return 0 -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: """Initialize the node for the dataset stored in tree.data""" cdef intp_t n_features = tree.data.shape[1] cdef intp_t i, j cdef float64_t rad = 0 - cdef float64_t* lower_bounds = &tree.node_bounds[0, i_node, 0] - cdef float64_t* upper_bounds = &tree.node_bounds[1, i_node, 0] - cdef float64_t* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data_row + cdef {{INPUT_DTYPE_t}}* data_row # determine Node bounds for j in range(n_features): @@ -81,8 +120,11 @@ cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, return 0 -cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: """Compute the minimum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] cdef float64_t d, d_lo, d_hi, rdist=0.0 @@ -105,16 +147,26 @@ cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, return rdist -cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the minimum distance between a point and a node""" if tree.dist_metric.p == INF: - return min_rdist(tree, i_node, pt) + return min_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(min_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) -cdef float64_t max_rdist(BinaryTree tree, - intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -134,16 +186,28 @@ cdef float64_t max_rdist(BinaryTree tree, return rdist -cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum distance between a point and a node""" if tree.dist_metric.p == INF: - return max_rdist(tree, i_node, pt) + return max_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(max_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: """Compute the minimum and maximum distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -177,8 +241,12 @@ cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, return 0 -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -208,15 +276,24 @@ cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(min_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -240,8 +317,20 @@ cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(max_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 927fde873ee58..c6a0d4bb975c2 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,7 +1,8 @@ +from cython cimport floating from ..utils._typedefs cimport float64_t, intp_t cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx index d293b765ea279..011b024fccb14 100644 --- a/sklearn/neighbors/_partition_nodes.pyx +++ b/sklearn/neighbors/_partition_nodes.pyx @@ -16,6 +16,8 @@ # - https://en.cppreference.com/w/cpp/algorithm/nth_element. # - https://github.com/scikit-learn/scikit-learn/pull/11103 # - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + cdef extern from *: """ @@ -63,7 +65,7 @@ cdef extern from *: cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index efca4e491ce01..5263f201f320b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -2,9 +2,9 @@ import numpy as np import pytest -from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal -from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 from sklearn.utils import check_random_state from sklearn.utils._testing import _convert_container from sklearn.utils.validation import check_array @@ -15,6 +15,13 @@ DIMENSION = 3 +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] BOOLEAN_METRICS = [ @@ -26,6 +33,11 @@ "sokalsneath", ] +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + def brute_force_neighbors(X, Y, k, metric, **kwargs): from sklearn.metrics import DistanceMetric @@ -37,9 +49,14 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) -def test_ball_tree_query_metrics(metric, array_type): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -52,31 +69,36 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree(X, leaf_size=1, metric=metric) + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) -def test_query_haversine(): +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree(X, leaf_size=1, metric="haversine") + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") - assert_array_almost_equal(dist1, dist2) + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) assert_array_almost_equal(ind1, ind2) -def test_array_object_type(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree(X) + BallTreeImplementation(X) -def test_bad_pyfunc_metric(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): def wrong_returned_value(x, y): return "1" @@ -86,8 +108,93 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_returned_value) + BallTreeImplementation(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree(X, metric=one_arg_func) + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 1aee28cc36bd0..749601baaf66f 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,30 +1,100 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_equal -from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree from sklearn.utils.parallel import Parallel, delayed DIMENSION = 3 METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] -def test_array_object_type(): + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree(X) + BinarySearchTree(X) -def test_kdtree_picklable_with_joblib(): +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): """Make sure that KDTree queries work when joblib memmaps. Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree(X, leaf_size=2) + tree = BinarySearchTree(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous # version of the Cython code. Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index 590e72ab785d2..4d8bac12f7423 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -13,7 +13,7 @@ kernel_norm, ) from sklearn.neighbors._ball_tree import ( - NeighborsHeap as NeighborsHeapBT, + NeighborsHeap64 as NeighborsHeapBT, ) from sklearn.neighbors._ball_tree import ( nodeheap_sort as nodeheap_sort_bt, @@ -25,7 +25,7 @@ KDTree, ) from sklearn.neighbors._kd_tree import ( - NeighborsHeap as NeighborsHeapKDT, + NeighborsHeap64 as NeighborsHeapKDT, ) from sklearn.neighbors._kd_tree import ( nodeheap_sort as nodeheap_sort_kdt,