From 3cd705bc4b1bfdd5c4a53467e0f041d733ef5aac Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 17 Mar 2023 16:45:19 +0500 Subject: [PATCH 01/36] ENH add float32 support in BallTree and KDTree --- .gitignore | 3 + setup.py | 5 +- sklearn/neighbors/_ball_tree.pyx | 186 ------------ sklearn/neighbors/_ball_tree.pyx.tp | 285 ++++++++++++++++++ .../{_binary_tree.pxi => _binary_tree.pxi.tp} | 203 ++++++++----- .../{_kd_tree.pyx => _kd_tree.pyx.tp} | 160 +++++++--- sklearn/neighbors/_partition_nodes.pxd | 3 +- sklearn/neighbors/_partition_nodes.pyx | 2 +- 8 files changed, 536 insertions(+), 311 deletions(-) delete mode 100644 sklearn/neighbors/_ball_tree.pyx create mode 100644 sklearn/neighbors/_ball_tree.pyx.tp rename sklearn/neighbors/{_binary_tree.pxi => _binary_tree.pxi.tp} (93%) rename sklearn/neighbors/{_kd_tree.pyx => _kd_tree.pyx.tp} (66%) diff --git a/.gitignore b/.gitignore index 89600846100a8..4a7e0694204d9 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,6 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +sklearn/neighbors/_ball_tree.pyx +sklearn/neighbors/_binary_tree.pxi +sklearn/neighbors/_kd_tree.pyx diff --git a/setup.py b/setup.py index f5522600f623f..ad71b61a1d949 100755 --- a/setup.py +++ b/setup.py @@ -303,8 +303,9 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_ball_tree.pyx"], "include_np": True}, - {"sources": ["_kd_tree.pyx"], "include_np": True}, + {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, + {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, {"sources": ["_quad_tree.pyx"], "include_np": True}, ], diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx deleted file mode 100644 index 30b8376be9146..0000000000000 --- a/sklearn/neighbors/_ball_tree.pyx +++ /dev/null @@ -1,186 +0,0 @@ -# Author: Jake Vanderplas -# License: BSD 3 clause - -__all__ = ['BallTree'] - -DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} - -VALID_METRICS = ['EuclideanDistance', 'SEuclideanDistance', - 'ManhattanDistance', 'ChebyshevDistance', - 'MinkowskiDistance', 'WMinkowskiDistance', - 'MahalanobisDistance', 'HammingDistance', - 'CanberraDistance', 'BrayCurtisDistance', - 'JaccardDistance', 'MatchingDistance', - 'DiceDistance', - 'RogersTanimotoDistance', 'RussellRaoDistance', - 'SokalMichenerDistance', 'SokalSneathDistance', - 'PyFuncDistance', 'HaversineDistance'] - - -include "_binary_tree.pxi" - -# Inherit BallTree from BinaryTree -cdef class BallTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) - pass - - -#---------------------------------------------------------------------- -# The functions below specialized the Binary Tree as a Ball Tree -# -# Note that these functions use the concept of "reduced distance". -# The reduced distance, defined for some metrics, is a quantity which -# is more efficient to compute than the distance, but preserves the -# relative rankings of the true distance. For example, the reduced -# distance for the Euclidean metric is the squared-euclidean distance. -# For some metrics, the reduced distance is simply the distance. - -cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, - ITYPE_t n_features) except -1: - """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype=DTYPE) - return 0 - - -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, - ITYPE_t idx_start, ITYPE_t idx_end) except -1: - """Initialize the node for the dataset stored in tree.data""" - cdef ITYPE_t n_features = tree.data.shape[1] - cdef ITYPE_t n_points = idx_end - idx_start - - cdef ITYPE_t i, j - cdef DTYPE_t radius - cdef DTYPE_t *this_pt - - cdef ITYPE_t* idx_array = &tree.idx_array[0] - cdef DTYPE_t* data = &tree.data[0, 0] - cdef DTYPE_t* centroid = &tree.node_bounds[0, i_node, 0] - - cdef bint with_sample_weight = tree.sample_weight is not None - cdef DTYPE_t* sample_weight - cdef DTYPE_t sum_weight_node - if with_sample_weight: - sample_weight = &tree.sample_weight[0] - - # determine Node centroid - for j in range(n_features): - centroid[j] = 0 - - if with_sample_weight: - sum_weight_node = 0 - for i in range(idx_start, idx_end): - sum_weight_node += sample_weight[idx_array[i]] - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] * sample_weight[idx_array[i]] - - for j in range(n_features): - centroid[j] /= sum_weight_node - else: - for i in range(idx_start, idx_end): - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] - - for j in range(n_features): - centroid[j] /= n_points - - # determine Node radius - radius = 0 - for i in range(idx_start, idx_end): - radius = fmax(radius, - tree.rdist(centroid, - data + n_features * idx_array[i], - n_features)) - - node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) - node_data[i_node].idx_start = idx_start - node_data[i_node].idx_end = idx_end - return 0 - - -cdef inline DTYPE_t min_dist(BinaryTree tree, ITYPE_t i_node, - DTYPE_t* pt) except -1 nogil: - """Compute the minimum distance between a point and a node""" - cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return fmax(0, dist_pt - tree.node_data[i_node].radius) - - -cdef inline DTYPE_t max_dist(BinaryTree tree, ITYPE_t i_node, - DTYPE_t* pt) except -1: - """Compute the maximum distance between a point and a node""" - cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return dist_pt + tree.node_data[i_node].radius - - -cdef inline int min_max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt, - DTYPE_t* min_dist, DTYPE_t* max_dist) except -1 nogil: - """Compute the minimum and maximum distance between a point and a node""" - cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - cdef DTYPE_t rad = tree.node_data[i_node].radius - min_dist[0] = fmax(0, dist_pt - rad) - max_dist[0] = dist_pt + rad - return 0 - - -cdef inline DTYPE_t min_rdist(BinaryTree tree, ITYPE_t i_node, - DTYPE_t* pt) except -1 nogil: - """Compute the minimum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist(min_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(min_dist(tree, i_node, pt)) - - -cdef inline DTYPE_t max_rdist(BinaryTree tree, ITYPE_t i_node, - DTYPE_t* pt) except -1: - """Compute the maximum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist(max_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(max_dist(tree, i_node, pt)) - - -cdef inline DTYPE_t min_dist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: - """compute the minimum distance between two nodes""" - cdef DTYPE_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return fmax(0, (dist_pt - tree1.node_data[i_node1].radius - - tree2.node_data[i_node2].radius)) - - -cdef inline DTYPE_t max_dist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: - """compute the maximum distance between two nodes""" - cdef DTYPE_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return (dist_pt + tree1.node_data[i_node1].radius - + tree2.node_data[i_node2].radius) - - -cdef inline DTYPE_t min_rdist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: - """compute the minimum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - - -cdef inline DTYPE_t max_rdist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: - """compute the maximum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000..435a932c669a6 --- /dev/null +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,285 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + # An empty string is used for the `name_suffix` of the float64 case + # and '32' bit is used for the `name_suffix` of the float32 case. + # This allows us to use `BinaryTree` conveniently and the default + # float64 case can be used without any particular modifications. + # + # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` + # + ('', 'DTYPE_t', 'DTYPE'), + ('32', 'cnp.float32_t', 'np.float32') +] + +# Author: Jake Vanderplas +# License: BSD 3 clause + +}} + + +__all__ = ['BallTree', 'BallTree32'] + +DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'BrayCurtisDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MatchingDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', +] + +{{endfor}} + + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT) + pass + +{{endfor}} + + +#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t n_nodes, + ITYPE_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + ITYPE_t i_node, + ITYPE_t idx_start, + ITYPE_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef ITYPE_t n_features = tree.data.shape[1] + cdef ITYPE_t n_points = idx_end - idx_start + + cdef ITYPE_t i, j + cdef DTYPE_t radius + cdef {{INPUT_DTYPE_t}} *this_pt + + cdef ITYPE_t* idx_array = &tree.idx_array[0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef {{INPUT_DTYPE_t}}* sample_weight + cdef DTYPE_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline DTYPE_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline DTYPE_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, + DTYPE_t* min_dist, + DTYPE_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef DTYPE_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef DTYPE_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline DTYPE_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline DTYPE_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline DTYPE_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef DTYPE_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline DTYPE_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef DTYPE_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline DTYPE_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline DTYPE_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi.tp similarity index 93% rename from sklearn/neighbors/_binary_tree.pxi rename to sklearn/neighbors/_binary_tree.pxi.tp index 99ed4341ad155..96ea2de8bdf2e 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1,4 +1,33 @@ -#!python +{{py: + +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + # An empty string is used for the `name_suffix` of the float64 case + # and '32' bit is used for the `name_suffix` of the float32 case. + # This allows us to use `DistanceMetric` conveniently and the default + # float64 case can be used without any particular modifications. + # + # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` + # + ('', 'DTYPE_t', 'DTYPE'), + ('32', 'cnp.float32_t', 'np.float32') +] + +# KD Tree and Ball Tree +# ===================== +# +# Author: Jake Vanderplas , 2012-2013 +# License: BSD +# +# This file is meant to be a literal include in a pyx file. +# See ball_tree.pyx and kd_tree.pyx + +}} # KD Tree and Ball Tree @@ -143,6 +172,7 @@ # """Compute the maximum distance between two nodes""" cimport numpy as cnp +from cython cimport floating from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax from libc.stdlib cimport calloc, malloc, free @@ -153,9 +183,13 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, + DistanceMetric32, euclidean_dist, + euclidean_dist32, euclidean_rdist, + euclidean_rdist32, euclidean_dist_to_rdist, + euclidean_dist_to_rdist32, ) from ._partition_nodes cimport partition_node_indices @@ -571,7 +605,7 @@ cdef class NeighborsHeap: # find_node_split_dim: # this computes the equivalent of # j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) -cdef ITYPE_t find_node_split_dim(DTYPE_t* data, +cdef ITYPE_t find_node_split_dim(const floating* data, ITYPE_t* node_indices, ITYPE_t n_features, ITYPE_t n_points) except -1: @@ -766,23 +800,25 @@ from sklearn.metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + ###################################################################### # Binary Tree class -cdef class BinaryTree: +cdef class BinaryTree{{name_suffix}}: - cdef readonly const DTYPE_t[:, ::1] data - cdef readonly const DTYPE_t[::1] sample_weight + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight cdef public DTYPE_t sum_weight cdef public const ITYPE_t[::1] idx_array cdef public const NodeData_t[::1] node_data - cdef public const DTYPE_t[:, :, ::1] node_bounds + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds cdef ITYPE_t leaf_size cdef ITYPE_t n_levels cdef ITYPE_t n_nodes - cdef DistanceMetric dist_metric + cdef DistanceMetric{{name_suffix}} dist_metric cdef int euclidean # variables to keep track of building & querying stats @@ -799,11 +835,11 @@ cdef class BinaryTree: # any problem due to potential access to this attribute # (e.g. assigning to NULL or a to value in another segment). def __cinit__(self): - self.data = np.empty((1, 1), dtype=DTYPE, order='C') - self.sample_weight = np.empty(1, dtype=DTYPE, order='C') + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') self.idx_array = np.empty(1, dtype=ITYPE, order='C') self.node_data = np.empty(1, dtype=NodeData, order='C') - self.node_bounds = np.empty((1, 1, 1), dtype=DTYPE) + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) self.leaf_size = 0 self.n_levels = 0 @@ -819,7 +855,7 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): # validate data - self.data = check_array(data, dtype=DTYPE, order='C') + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') if self.data.size == 0: raise ValueError("X is an empty array") @@ -830,12 +866,12 @@ cdef class BinaryTree: raise ValueError("leaf_size must be greater than or equal to 1") self.leaf_size = leaf_size - self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + self.dist_metric = DistanceMetric{{name_suffix}}.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') metric = self.dist_metric.__class__.__name__ - if metric not in VALID_METRICS: + if metric not in VALID_METRICS{{name_suffix}}: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, **DOC_DICT)) @@ -855,7 +891,7 @@ cdef class BinaryTree: self._update_sample_weight(n_samples, sample_weight) # Allocate tree-specific data - allocate_data(self, self.n_nodes, n_features) + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) self._recursive_build( node_data=self.node_data.base, i_node=0, @@ -866,7 +902,7 @@ cdef class BinaryTree: def _update_sample_weight(self, n_samples, sample_weight): if sample_weight is not None: self.sample_weight = np.asarray( - sample_weight, dtype=DTYPE, order='C') + sample_weight, dtype={{INPUT_DTYPE}}, order='C') self.sum_weight = np.sum(self.sample_weight) else: self.sample_weight = None @@ -992,16 +1028,16 @@ cdef class BinaryTree: """ return cls._valid_metrics - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, ITYPE_t size) except -1 nogil: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: - return euclidean_dist(x1, x2, size) + return euclidean_dist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.dist(x1, x2, size) - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, ITYPE_t size) except -1 nogil: """Compute the reduced distance between arrays x1 and x2. @@ -1012,7 +1048,7 @@ cdef class BinaryTree: """ self.n_calls += 1 if self.euclidean: - return euclidean_rdist(x1, x2, size) + return euclidean_rdist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.rdist(x1, x2, size) @@ -1033,10 +1069,10 @@ cdef class BinaryTree: cdef ITYPE_t n_points = idx_end - idx_start cdef ITYPE_t n_mid = n_points / 2 cdef ITYPE_t* idx_array = &self.idx_array[idx_start] - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # initialize node data - init_node(self, node_data, i_node, idx_start, idx_end) + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) if 2 * i_node + 1 >= self.n_nodes: node_data[i_node].is_leaf = True @@ -1113,7 +1149,7 @@ cdef class BinaryTree: corresponding point. """ # XXX: we should allow X to be a pre-built tree. - X = check_array(X, dtype=DTYPE, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " @@ -1125,10 +1161,10 @@ cdef class BinaryTree: # flatten X, and save original shape information np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef const DTYPE_t[:, ::1] Xarr = np_Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr cdef DTYPE_t reduced_dist_LB cdef ITYPE_t i - cdef DTYPE_t* pt + cdef {{INPUT_DTYPE_t}}* pt # initialize heap for neighbors cdef NeighborsHeap heap = NeighborsHeap(Xarr.shape[0], k) @@ -1151,7 +1187,7 @@ cdef class BinaryTree: if breadth_first: self._query_dual_breadthfirst(other, heap, nodeheap) else: - reduced_dist_LB = min_rdist_dual(self, 0, other, 0) + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) bounds = np.full(other.node_data.shape[0], np.inf) self._query_dual_depthfirst(0, other, 0, bounds, heap, reduced_dist_LB) @@ -1165,7 +1201,7 @@ cdef class BinaryTree: else: with nogil: for i in range(Xarr.shape[0]): - reduced_dist_LB = min_rdist(self, 0, pt) + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) self._query_single_depthfirst(0, pt, i, heap, reduced_dist_LB) pt += Xarr.shape[1] @@ -1245,18 +1281,18 @@ cdef class BinaryTree: cdef ITYPE_t n_features = self.data.shape[1] cdef DTYPE_t[::1] dist_arr_i cdef ITYPE_t[::1] idx_arr_i, counts - cdef DTYPE_t* pt + cdef {{INPUT_DTYPE_t}}* pt cdef ITYPE_t** indices = NULL cdef DTYPE_t** distances = NULL # validate X and prepare for query - X = check_array(X, dtype=DTYPE, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") - cdef const DTYPE_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) # prepare r for query r = np.asarray(r, dtype=DTYPE, order='C') @@ -1453,18 +1489,18 @@ cdef class BinaryTree: cdef DTYPE_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) # validate X and prepare for query - X = check_array(X, dtype=DTYPE, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != n_features: raise ValueError("query data dimension must " "match training data dimension") Xarr_np = X.reshape((-1, n_features)) - cdef DTYPE_t[:, ::1] Xarr = Xarr_np + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np log_density_arr = np.zeros(Xarr.shape[0], dtype=DTYPE) cdef DTYPE_t[::1] log_density = log_density_arr - cdef DTYPE_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] cdef NodeHeap nodeheap if breadth_first: @@ -1489,7 +1525,7 @@ cdef class BinaryTree: pt += n_features else: for i in range(Xarr.shape[0]): - min_max_dist(self, 0, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) # compute max & min bounds on density within top node log_min_bound = (log(self.sum_weight) + compute_log_kernel(dist_UB, @@ -1547,14 +1583,14 @@ cdef class BinaryTree: cdef ITYPE_t i # validate X and prepare for query - X = check_array(X, dtype=DTYPE, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef DTYPE_t[:, ::1] Xarr = np_Xarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr # prepare r for query r = np.asarray(r, dtype=DTYPE, order='C') @@ -1569,7 +1605,7 @@ cdef class BinaryTree: count = np.zeros(r.shape[0], dtype=ITYPE) cdef ITYPE_t[::1] carr = count - cdef DTYPE_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] if dualtree: other = self.__class__(Xarr, metric=self.dist_metric, @@ -1585,7 +1621,7 @@ cdef class BinaryTree: return count cdef int _query_single_depthfirst(self, ITYPE_t i_node, - DTYPE_t* pt, ITYPE_t i_pt, + {{INPUT_DTYPE_t}}* pt, ITYPE_t i_pt, NeighborsHeap heap, DTYPE_t reduced_dist_LB) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" @@ -1594,7 +1630,7 @@ cdef class BinaryTree: cdef DTYPE_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 cdef ITYPE_t i, i1, i2 - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] #------------------------------------------------------------ # Case 1: query point is outside node radius: @@ -1619,8 +1655,8 @@ cdef class BinaryTree: self.n_splits += 1 i1 = 2 * i_node + 1 i2 = i1 + 1 - reduced_dist_LB_1 = min_rdist(self, i1, pt) - reduced_dist_LB_2 = min_rdist(self, i2, pt) + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) # recursively query subnodes if reduced_dist_LB_1 <= reduced_dist_LB_2: @@ -1635,7 +1671,7 @@ cdef class BinaryTree: reduced_dist_LB_1) return 0 - cdef int _query_single_breadthfirst(self, DTYPE_t* pt, + cdef int _query_single_breadthfirst(self, {{INPUT_DTYPE_t}}* pt, ITYPE_t i_pt, NeighborsHeap heap, NodeHeap nodeheap) except -1: @@ -1643,11 +1679,11 @@ cdef class BinaryTree: cdef ITYPE_t i, i_node cdef DTYPE_t dist_pt, reduced_dist_LB cdef NodeData_t* node_data = &self.node_data[0] - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # Set up the node heap and push the head node onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist(self, 0, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) @@ -1680,12 +1716,12 @@ cdef class BinaryTree: self.n_splits += 1 for i in range(2 * i_node + 1, 2 * i_node + 3): nodeheap_item.i1 = i - nodeheap_item.val = min_rdist(self, i, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) nodeheap.push(nodeheap_item) return 0 cdef int _query_dual_depthfirst(self, ITYPE_t i_node1, - BinaryTree other, ITYPE_t i_node2, + BinaryTree{{name_suffix}} other, ITYPE_t i_node2, DTYPE_t[::1] bounds, NeighborsHeap heap, DTYPE_t reduced_dist_LB) except -1: @@ -1696,8 +1732,8 @@ cdef class BinaryTree: cdef NodeData_t node_info1 = self.node_data[i_node1] cdef NodeData_t node_info2 = other.node_data[i_node2] - cdef DTYPE_t* data1 = &self.data[0, 0] - cdef DTYPE_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef ITYPE_t n_features = self.data.shape[1] cdef DTYPE_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 @@ -1748,9 +1784,9 @@ cdef class BinaryTree: # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf and node_info2.radius > node_info1.radius): - reduced_dist_LB1 = min_rdist_dual(self, i_node1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 1) - reduced_dist_LB2 = min_rdist_dual(self, i_node1, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1768,9 +1804,9 @@ cdef class BinaryTree: # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: - reduced_dist_LB1 = min_rdist_dual(self, 2 * i_node1 + 1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, other, i_node2) - reduced_dist_LB2 = min_rdist_dual(self, 2 * i_node1 + 2, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, other, i_node2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1785,7 +1821,7 @@ cdef class BinaryTree: bounds, heap, reduced_dist_LB1) return 0 - cdef int _query_dual_breadthfirst(self, BinaryTree other, + cdef int _query_dual_breadthfirst(self, BinaryTree{{name_suffix}} other, NeighborsHeap heap, NodeHeap nodeheap) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" @@ -1795,13 +1831,13 @@ cdef class BinaryTree: cdef NodeData_t* node_data1 = &self.node_data[0] cdef NodeData_t* node_data2 = &other.node_data[0] cdef NodeData_t node_info1, node_info2 - cdef DTYPE_t* data1 = &self.data[0, 0] - cdef DTYPE_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef ITYPE_t n_features = self.data.shape[1] # Set up the node heap and push the head nodes onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist_dual(self, 0, other, 0) + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) nodeheap_item.i1 = 0 nodeheap_item.i2 = 0 nodeheap.push(nodeheap_item) @@ -1853,7 +1889,7 @@ cdef class BinaryTree: nodeheap_item.i1 = i_node1 for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): nodeheap_item.i2 = i2 - nodeheap_item.val = min_rdist_dual(self, i_node1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, other, i2) nodeheap.push(nodeheap_item) @@ -1864,21 +1900,21 @@ cdef class BinaryTree: nodeheap_item.i2 = i_node2 for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): nodeheap_item.i1 = i1 - nodeheap_item.val = min_rdist_dual(self, i1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, other, i_node2) nodeheap.push(nodeheap_item) return 0 cdef ITYPE_t _query_radius_single(self, ITYPE_t i_node, - DTYPE_t* pt, DTYPE_t r, + {{INPUT_DTYPE_t}}* pt, DTYPE_t r, ITYPE_t* indices, DTYPE_t* distances, ITYPE_t count, int count_only, int return_distance) noexcept nogil: """recursive single-tree radius query, depth-first""" - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef ITYPE_t* idx_array = &self.idx_array[0] cdef ITYPE_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -1887,7 +1923,7 @@ cdef class BinaryTree: cdef DTYPE_t reduced_r cdef DTYPE_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) #------------------------------------------------------------ # Case 1: all node points are outside distance r. @@ -1945,7 +1981,7 @@ cdef class BinaryTree: return count - cdef DTYPE_t _kde_single_breadthfirst(self, DTYPE_t* pt, + cdef DTYPE_t _kde_single_breadthfirst(self, {{INPUT_DTYPE_t}}* pt, KernelType kernel, DTYPE_t h, DTYPE_t log_knorm, DTYPE_t log_atol, DTYPE_t log_rtol, @@ -1965,9 +2001,9 @@ cdef class BinaryTree: cdef DTYPE_t global_log_min_bound, global_log_bound_spread cdef DTYPE_t global_log_max_bound - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef bint with_sample_weight = self.sample_weight is not None - cdef DTYPE_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight if with_sample_weight: sample_weight = &self.sample_weight[0] cdef ITYPE_t* idx_array = &self.idx_array[0] @@ -1989,13 +2025,13 @@ cdef class BinaryTree: # push the top node to the heap cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_dist(self, 0, pt) + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) - global_log_min_bound = log(N) + compute_log_kernel(max_dist(self, - 0, pt), - h, kernel) + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, h, kernel) global_log_bound_spread = logsubexp(global_log_max_bound, @@ -2064,8 +2100,8 @@ cdef class BinaryTree: N1 = node_data[i1].idx_end - node_data[i1].idx_start N2 = node_data[i2].idx_end - node_data[i2].idx_start - min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1) - min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) node_log_min_bounds[i1] = (log(N1) + compute_log_kernel(dist_UB_1, @@ -2110,7 +2146,7 @@ cdef class BinaryTree: global_log_bound_spread - log(2)) cdef int _kde_single_depthfirst( - self, ITYPE_t i_node, DTYPE_t* pt, + self, ITYPE_t i_node, {{INPUT_DTYPE_t}}* pt, KernelType kernel, DTYPE_t h, DTYPE_t log_knorm, DTYPE_t log_atol, DTYPE_t log_rtol, @@ -2127,10 +2163,10 @@ cdef class BinaryTree: cdef ITYPE_t i, i1, i2, iw, start, end cdef DTYPE_t N1, N2 - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef NodeData_t* node_data = &self.node_data[0] cdef bint with_sample_weight = self.sample_weight is not None - cdef DTYPE_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight cdef DTYPE_t log_weight if with_sample_weight: sample_weight = &self.sample_weight[0] @@ -2200,7 +2236,7 @@ cdef class BinaryTree: N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) - min_max_dist(self, i1, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, kernel) child1_log_bound_spread = logsubexp(log(N1) + @@ -2208,7 +2244,7 @@ cdef class BinaryTree: kernel), child1_log_min_bound) - min_max_dist(self, i2, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, kernel) child2_log_bound_spread = logsubexp(log(N2) + @@ -2244,11 +2280,11 @@ cdef class BinaryTree: global_log_bound_spread) return 0 - cdef int _two_point_single(self, ITYPE_t i_node, DTYPE_t* pt, DTYPE_t* r, + cdef int _two_point_single(self, ITYPE_t i_node, {{INPUT_DTYPE_t}}* pt, DTYPE_t* r, ITYPE_t* count, ITYPE_t i_min, ITYPE_t i_max) except -1: """recursive single-tree two-point correlation function query""" - cdef DTYPE_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef ITYPE_t* idx_array = &self.idx_array[0] cdef ITYPE_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -2257,7 +2293,7 @@ cdef class BinaryTree: cdef DTYPE_t reduced_r cdef DTYPE_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) #------------------------------------------------------------ # Go through bounds and check for cuts @@ -2294,12 +2330,12 @@ cdef class BinaryTree: return 0 cdef int _two_point_dual(self, ITYPE_t i_node1, - BinaryTree other, ITYPE_t i_node2, + BinaryTree{{name_suffix}} other, ITYPE_t i_node2, DTYPE_t* r, ITYPE_t* count, ITYPE_t i_min, ITYPE_t i_max) except -1: """recursive dual-tree two-point correlation function query""" - cdef DTYPE_t* data1 = &self.data[0, 0] - cdef DTYPE_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef ITYPE_t* idx_array1 = &self.idx_array[0] cdef ITYPE_t* idx_array2 = &other.idx_array[0] cdef NodeData_t node_info1 = self.node_data[i_node1] @@ -2311,8 +2347,8 @@ cdef class BinaryTree: cdef DTYPE_t reduced_r cdef DTYPE_t dist_pt, dist_LB = 0, dist_UB = 0 - dist_LB = min_dist_dual(self, i_node1, other, i_node2) - dist_UB = max_dist_dual(self, i_node1, other, i_node2) + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) #------------------------------------------------------------ # Go through bounds and check for cuts @@ -2365,6 +2401,7 @@ cdef class BinaryTree: r, count, i_min, i_max) return 0 +{{endfor}} ###################################################################### # Python functions for benchmarking and testing C implementations @@ -2419,7 +2456,7 @@ def nodeheap_sort(DTYPE_t[::1] vals): cdef inline DTYPE_t _total_node_weight(NodeData_t* node_data, - DTYPE_t* sample_weight, + const floating* sample_weight, ITYPE_t* idx_array, ITYPE_t i_node): cdef ITYPE_t i diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx.tp similarity index 66% rename from sklearn/neighbors/_kd_tree.pyx rename to sklearn/neighbors/_kd_tree.pyx.tp index a5db18b4ad772..7e7646b2b926b 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -1,22 +1,52 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + # An empty string is used for the `name_suffix` of the float64 case + # and '32' bit is used for the `name_suffix` of the float32 case. + # This allows us to use `BinaryTree` conveniently and the default + # float64 case can be used without any particular modifications. + # + # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` + # + ('', 'DTYPE_t', 'DTYPE'), + ('32', 'cnp.float32_t', 'np.float32') +] + # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD -__all__ = ['KDTree'] +}} + + +__all__ = ['KDTree', 'KDTree32'] DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} VALID_METRICS = ['EuclideanDistance', 'ManhattanDistance', 'ChebyshevDistance', 'MinkowskiDistance'] +VALID_METRICS32 = ['EuclideanDistance32', 'ManhattanDistance32', + 'ChebyshevDistance32', 'MinkowskiDistance32'] + include "_binary_tree.pxi" -# Inherit KDTree from BinaryTree -cdef class KDTree(BinaryTree): +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): __doc__ = CLASS_DOC.format(**DOC_DICT) pass +{{endfor}} + #---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a KD Tree @@ -28,27 +58,36 @@ cdef class KDTree(BinaryTree): # distance for the Euclidean metric is the squared-euclidean distance. # For some metrics, the reduced distance is simply the distance. +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, - ITYPE_t n_features) except -1: +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t n_nodes, + ITYPE_t n_features, +) except -1: """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype=DTYPE) + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) return 0 -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, - ITYPE_t idx_start, ITYPE_t idx_end) except -1: +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + ITYPE_t i_node, + ITYPE_t idx_start, + ITYPE_t idx_end, +) except -1: """Initialize the node for the dataset stored in tree.data""" cdef ITYPE_t n_features = tree.data.shape[1] cdef ITYPE_t i, j cdef DTYPE_t rad = 0 - cdef DTYPE_t* lower_bounds = &tree.node_bounds[0, i_node, 0] - cdef DTYPE_t* upper_bounds = &tree.node_bounds[1, i_node, 0] - cdef DTYPE_t* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] cdef ITYPE_t* idx_array = &tree.idx_array[0] - cdef DTYPE_t* data_row + cdef {{INPUT_DTYPE_t}}* data_row # determine Node bounds for j in range(n_features): @@ -81,8 +120,11 @@ cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, return 0 -cdef DTYPE_t min_rdist(BinaryTree tree, ITYPE_t i_node, - DTYPE_t* pt) except -1 nogil: +cdef DTYPE_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: """Compute the minimum reduced-distance between a point and a node""" cdef ITYPE_t n_features = tree.data.shape[1] cdef DTYPE_t d, d_lo, d_hi, rdist=0.0 @@ -105,16 +147,26 @@ cdef DTYPE_t min_rdist(BinaryTree tree, ITYPE_t i_node, return rdist -cdef DTYPE_t min_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt) except -1: +cdef DTYPE_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the minimum distance between a point and a node""" if tree.dist_metric.p == INF: - return min_rdist(tree, i_node, pt) + return min_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(min_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) -cdef DTYPE_t max_rdist(BinaryTree tree, - ITYPE_t i_node, DTYPE_t* pt) except -1: +cdef DTYPE_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum reduced-distance between a point and a node""" cdef ITYPE_t n_features = tree.data.shape[1] @@ -134,16 +186,28 @@ cdef DTYPE_t max_rdist(BinaryTree tree, return rdist -cdef DTYPE_t max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt) except -1: +cdef DTYPE_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum distance between a point and a node""" if tree.dist_metric.p == INF: - return max_rdist(tree, i_node, pt) + return max_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(max_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) - - -cdef inline int min_max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt, - DTYPE_t* min_dist, DTYPE_t* max_dist) except -1 nogil: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + ITYPE_t i_node, + {{INPUT_DTYPE_t}}* pt, + DTYPE_t* min_dist, + DTYPE_t* max_dist, +) except -1 nogil: """Compute the minimum and maximum distance between a point and a node""" cdef ITYPE_t n_features = tree.data.shape[1] @@ -177,8 +241,12 @@ cdef inline int min_max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt, return 0 -cdef inline DTYPE_t min_rdist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: +cdef inline DTYPE_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: """Compute the minimum reduced distance between two nodes""" cdef ITYPE_t n_features = tree1.data.shape[1] @@ -208,15 +276,24 @@ cdef inline DTYPE_t min_rdist_dual(BinaryTree tree1, ITYPE_t i_node1, return rdist -cdef inline DTYPE_t min_dist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: +cdef inline DTYPE_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: """Compute the minimum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(min_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) -cdef inline DTYPE_t max_rdist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: +cdef inline DTYPE_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: """Compute the maximum reduced distance between two nodes""" cdef ITYPE_t n_features = tree1.data.shape[1] @@ -240,8 +317,15 @@ cdef inline DTYPE_t max_rdist_dual(BinaryTree tree1, ITYPE_t i_node1, return rdist -cdef inline DTYPE_t max_dist_dual(BinaryTree tree1, ITYPE_t i_node1, - BinaryTree tree2, ITYPE_t i_node2) except -1: +cdef inline DTYPE_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + ITYPE_t i_node1, + BinaryTree{{name_suffix}} tree2, + ITYPE_t i_node2, +) except -1: """Compute the maximum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(max_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 94b02002d7a1e..05944d660ede8 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,7 +1,8 @@ +from cython cimport floating from ..utils._typedefs cimport DTYPE_t, ITYPE_t cdef int partition_node_indices( - DTYPE_t *data, + floating *data, ITYPE_t *node_indices, ITYPE_t split_dim, ITYPE_t split_index, diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx index f2f655a7de275..6aa6852673913 100644 --- a/sklearn/neighbors/_partition_nodes.pyx +++ b/sklearn/neighbors/_partition_nodes.pyx @@ -63,7 +63,7 @@ cdef extern from *: cdef int partition_node_indices( - DTYPE_t *data, + floating *data, ITYPE_t *node_indices, ITYPE_t split_dim, ITYPE_t split_index, From 2071c0e4e7e94c0f2e4ecab8d47a2a9149acfc7a Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 20 Mar 2023 16:40:38 +0500 Subject: [PATCH 02/36] Remove 32 variant for VALID_METRICS --- sklearn/neighbors/_ball_tree.pyx.tp | 45 ++++++++++++--------------- sklearn/neighbors/_binary_tree.pxi.tp | 2 +- sklearn/neighbors/_kd_tree.pyx.tp | 4 --- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 435a932c669a6..4ea693c47f878 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -28,33 +28,28 @@ __all__ = ['BallTree', 'BallTree32'] DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} - -VALID_METRICS{{name_suffix}} = [ - 'EuclideanDistance{{name_suffix}}', - 'SEuclideanDistance{{name_suffix}}', - 'ManhattanDistance{{name_suffix}}', - 'ChebyshevDistance{{name_suffix}}', - 'MinkowskiDistance{{name_suffix}}', - 'WMinkowskiDistance{{name_suffix}}', - 'MahalanobisDistance{{name_suffix}}', - 'HammingDistance{{name_suffix}}', - 'CanberraDistance{{name_suffix}}', - 'BrayCurtisDistance{{name_suffix}}', - 'JaccardDistance{{name_suffix}}', - 'MatchingDistance{{name_suffix}}', - 'DiceDistance{{name_suffix}}', - 'RogersTanimotoDistance{{name_suffix}}', - 'RussellRaoDistance{{name_suffix}}', - 'SokalMichenerDistance{{name_suffix}}', - 'SokalSneathDistance{{name_suffix}}', - 'PyFuncDistance{{name_suffix}}', - 'HaversineDistance{{name_suffix}}', +VALID_METRICS = [ + 'EuclideanDistance', + 'SEuclideanDistance', + 'ManhattanDistance', + 'ChebyshevDistance', + 'MinkowskiDistance', + 'WMinkowskiDistance', + 'MahalanobisDistance', + 'HammingDistance', + 'CanberraDistance', + 'BrayCurtisDistance', + 'JaccardDistance', + 'MatchingDistance', + 'DiceDistance', + 'RogersTanimotoDistance', + 'RussellRaoDistance', + 'SokalMichenerDistance', + 'SokalSneathDistance', + 'PyFuncDistance', + 'HaversineDistance', ] -{{endfor}} - - include "_binary_tree.pxi" {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 96ea2de8bdf2e..29ddd63eedcd8 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -871,7 +871,7 @@ cdef class BinaryTree{{name_suffix}}: == 'EuclideanDistance') metric = self.dist_metric.__class__.__name__ - if metric not in VALID_METRICS{{name_suffix}}: + if metric not in VALID_METRICS: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, **DOC_DICT)) diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 7e7646b2b926b..f08140aa98478 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -32,10 +32,6 @@ DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} VALID_METRICS = ['EuclideanDistance', 'ManhattanDistance', 'ChebyshevDistance', 'MinkowskiDistance'] -VALID_METRICS32 = ['EuclideanDistance32', 'ManhattanDistance32', - 'ChebyshevDistance32', 'MinkowskiDistance32'] - - include "_binary_tree.pxi" {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} From 9e3506667df1e26e4a588f308395a6782816611f Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Tue, 21 Mar 2023 12:09:24 +0500 Subject: [PATCH 03/36] Fix import issue --- setup.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index ad71b61a1d949..4cdb9762589af 100755 --- a/setup.py +++ b/setup.py @@ -303,8 +303,7 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, - {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp", "_binary_tree.pxi.tp"], "include_np": True}, {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, {"sources": ["_quad_tree.pyx"], "include_np": True}, @@ -498,10 +497,10 @@ def configure_extension_modules(): # `source` is a Tempita file tempita_sources.append(source) - # Do not include pxd files that were generated by tempita - if os.path.splitext(new_source_path)[-1] == ".pxd": - continue - sources.append(new_source_path) + # Do not include header files (".pxd") and include files + # (".pxi") that were generated by Tempita. + if os.path.splitext(new_source_path)[-1] not in (".pxd", ".pxi"): + sources.append(new_source_path) gen_from_templates(tempita_sources) From 301afc9d1d2bdf9fcad14aebafb1c4f4ac113a02 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Tue, 21 Mar 2023 19:31:33 +0500 Subject: [PATCH 04/36] Add tests for ball tree --- sklearn/neighbors/_ball_tree.pyx.tp | 10 +++--- sklearn/neighbors/_binary_tree.pxi.tp | 2 +- sklearn/neighbors/tests/test_ball_tree.py | 41 +++++++++++++++++++++-- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 2657a5bef1b9f..62b524c601dde 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -96,7 +96,7 @@ cdef int init_node{{name_suffix}}( cdef intp_t n_points = idx_end - idx_start cdef intp_t i, j - cdef float64_t radius + cdef {{INPUT_DTYPE_t}} radius cdef {{INPUT_DTYPE_t}} *this_pt cdef intp_t* idx_array = &tree.idx_array[0] @@ -146,7 +146,7 @@ cdef int init_node{{name_suffix}}( return 0 -cdef inline float64_t min_dist{{name_suffix}}( +cdef inline {{INPUT_DTYPE_t}} min_dist{{name_suffix}}( BinaryTree{{name_suffix}} tree, intp_t i_node, {{INPUT_DTYPE_t}}* pt, @@ -157,7 +157,7 @@ cdef inline float64_t min_dist{{name_suffix}}( return fmax(0, dist_pt - tree.node_data[i_node].radius) -cdef inline float64_t max_dist{{name_suffix}}( +cdef inline {{INPUT_DTYPE_t}} max_dist{{name_suffix}}( BinaryTree{{name_suffix}} tree, intp_t i_node, {{INPUT_DTYPE_t}}* pt, @@ -216,7 +216,7 @@ cdef inline float64_t max_rdist{{name_suffix}}( ) -cdef inline float64_t min_dist_dual{{name_suffix}}( +cdef inline {{INPUT_DTYPE_t}} min_dist_dual{{name_suffix}}( BinaryTree{{name_suffix}} tree1, intp_t i_node1, BinaryTree{{name_suffix}} tree2, @@ -230,7 +230,7 @@ cdef inline float64_t min_dist_dual{{name_suffix}}( - tree2.node_data[i_node2].radius)) -cdef inline float64_t max_dist_dual{{name_suffix}}( +cdef inline {{INPUT_DTYPE_t}} max_dist_dual{{name_suffix}}( BinaryTree{{name_suffix}} tree1, intp_t i_node1, BinaryTree{{name_suffix}} tree2, diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 24c74a3f7dec7..c09125321392a 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -871,7 +871,7 @@ cdef class BinaryTree{{name_suffix}}: self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') - metric = self.dist_metric.__class__.__name__ + metric = self.dist_metric.__class__.__name__.rstrip("32") if metric not in VALID_METRICS: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 8d665f799e9d8..5e2a56836bd2c 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -2,8 +2,8 @@ import numpy as np import pytest -from numpy.testing import assert_array_almost_equal -from sklearn.neighbors._ball_tree import BallTree +from numpy.testing import assert_array_almost_equal, assert_allclose, assert_equal +from sklearn.neighbors._ball_tree import BallTree, BallTree32 from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -101,3 +101,40 @@ def one_arg_func(x): msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): BallTree(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(metric): + _X = rng.random_sample((40, 3)).round(0) + _Y = rng.random_sample((10, 3)).round(0) + + X_64 = _X.astype(dtype=np.float64) + Y_64 = _Y.astype(dtype=np.float64) + + X_32 = _X.astype(dtype=np.float32) + Y_32 = _Y.astype(dtype=np.float32) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32) + assert_equal(ind_64, ind_32) + + # Test consistency with respect to the `query_radius` method + r = 0.3 + ind_64, neighbors_64 = bt_64.query_radius(Y_64[0:2, :], r=r) + ind_32, neighbors_32 = bt_32.query_radius(Y_32[0:2, :], r=r) + assert_equal(ind_64, ind_32) + assert_allclose(neighbors_64, neighbors_32) + + # Test consistency with respect to the `kernel_density` method + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel) + assert_allclose(density64, density32) From 4e666f9bbd395032ea3afd4814ea39b5bb72e233 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 22 Mar 2023 17:52:19 +0500 Subject: [PATCH 05/36] Fix tests and apply some suggestions --- sklearn/neighbors/_ball_tree.pyx.tp | 61 +++++++++++++---------- sklearn/neighbors/_binary_tree.pxi.tp | 26 ++++------ sklearn/neighbors/_kd_tree.pyx.tp | 19 +++++-- sklearn/neighbors/_partition_nodes.pyx | 2 + sklearn/neighbors/tests/test_ball_tree.py | 30 +++++------ 5 files changed, 77 insertions(+), 61 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 62b524c601dde..1e04917d4be0d 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -26,30 +26,37 @@ implementation_specific_values = [ __all__ = ['BallTree', 'BallTree32'] -DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} - -VALID_METRICS = [ - 'EuclideanDistance', - 'SEuclideanDistance', - 'ManhattanDistance', - 'ChebyshevDistance', - 'MinkowskiDistance', - 'WMinkowskiDistance', - 'MahalanobisDistance', - 'HammingDistance', - 'CanberraDistance', - 'BrayCurtisDistance', - 'JaccardDistance', - 'MatchingDistance', - 'DiceDistance', - 'RogersTanimotoDistance', - 'RussellRaoDistance', - 'SokalMichenerDistance', - 'SokalSneathDistance', - 'PyFuncDistance', - 'HaversineDistance', +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'BrayCurtisDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MatchingDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', ] +{{endfor}} + include "_binary_tree.pxi" {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -96,7 +103,7 @@ cdef int init_node{{name_suffix}}( cdef intp_t n_points = idx_end - idx_start cdef intp_t i, j - cdef {{INPUT_DTYPE_t}} radius + cdef float64_t radius cdef {{INPUT_DTYPE_t}} *this_pt cdef intp_t* idx_array = &tree.idx_array[0] @@ -146,7 +153,7 @@ cdef int init_node{{name_suffix}}( return 0 -cdef inline {{INPUT_DTYPE_t}} min_dist{{name_suffix}}( +cdef inline float64_t min_dist{{name_suffix}}( BinaryTree{{name_suffix}} tree, intp_t i_node, {{INPUT_DTYPE_t}}* pt, @@ -157,7 +164,7 @@ cdef inline {{INPUT_DTYPE_t}} min_dist{{name_suffix}}( return fmax(0, dist_pt - tree.node_data[i_node].radius) -cdef inline {{INPUT_DTYPE_t}} max_dist{{name_suffix}}( +cdef inline float64_t max_dist{{name_suffix}}( BinaryTree{{name_suffix}} tree, intp_t i_node, {{INPUT_DTYPE_t}}* pt, @@ -216,7 +223,7 @@ cdef inline float64_t max_rdist{{name_suffix}}( ) -cdef inline {{INPUT_DTYPE_t}} min_dist_dual{{name_suffix}}( +cdef inline float64_t min_dist_dual{{name_suffix}}( BinaryTree{{name_suffix}} tree1, intp_t i_node1, BinaryTree{{name_suffix}} tree2, @@ -230,7 +237,7 @@ cdef inline {{INPUT_DTYPE_t}} min_dist_dual{{name_suffix}}( - tree2.node_data[i_node2].radius)) -cdef inline {{INPUT_DTYPE_t}} max_dist_dual{{name_suffix}}( +cdef inline float64_t max_dist_dual{{name_suffix}}( BinaryTree{{name_suffix}} tree1, intp_t i_node1, BinaryTree{{name_suffix}} tree2, diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index c09125321392a..a9944caf99c1f 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -24,8 +24,8 @@ implementation_specific_values = [ # Author: Jake Vanderplas , 2012-2013 # License: BSD # -# This file is meant to be a literal include in a pyx file. -# See ball_tree.pyx and kd_tree.pyx +# The file generated is then literally included in ball_tree.pyx and kd_tree.pyx. +# See ball_tree.pyx.tp and kd_tree.pyx.tp. }} @@ -33,12 +33,6 @@ implementation_specific_values = [ # KD Tree and Ball Tree # ===================== # -# Author: Jake Vanderplas , 2012-2013 -# License: BSD -# -# This file is meant to be a literal include in a pyx file. -# See ball_tree.pyx and kd_tree.pyx -# # The routines here are the core algorithms of the KDTree and BallTree # structures. If Cython supported polymorphism, we would be able to # create a subclass and derive KDTree and BallTree from it. Because @@ -795,13 +789,13 @@ def newObj(obj): return obj.__new__(obj) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + ###################################################################### -# define the reverse mapping of VALID_METRICS +# define the reverse mapping of VALID_METRICS{{name_suffix}} from sklearn.metrics._dist_metrics import get_valid_metric_ids -VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) - +VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### # Binary Tree class @@ -869,13 +863,13 @@ cdef class BinaryTree{{name_suffix}}: self.dist_metric = DistanceMetric{{name_suffix}}.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance') + == 'EuclideanDistance{{name_suffix}}') - metric = self.dist_metric.__class__.__name__.rstrip("32") - if metric not in VALID_METRICS: + metric = self.dist_metric.__class__.__name__ + if metric not in VALID_METRICS{{name_suffix}}: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, - **DOC_DICT)) + **DOC_DICT{{name_suffix}})) self.dist_metric._validate_data(self.data) # determine number of levels in the tree, and from this diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 22c420918d871..cafb44ff2db0c 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -27,10 +27,21 @@ implementation_specific_values = [ __all__ = ['KDTree', 'KDTree32'] -DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} -VALID_METRICS = ['EuclideanDistance', 'ManhattanDistance', - 'ChebyshevDistance', 'MinkowskiDistance'] +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} include "_binary_tree.pxi" @@ -38,7 +49,7 @@ include "_binary_tree.pxi" # Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): - __doc__ = CLASS_DOC.format(**DOC_DICT) + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) pass {{endfor}} diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx index 12ebda8144d0d..011b024fccb14 100644 --- a/sklearn/neighbors/_partition_nodes.pyx +++ b/sklearn/neighbors/_partition_nodes.pyx @@ -16,6 +16,8 @@ # - https://en.cppreference.com/w/cpp/algorithm/nth_element. # - https://github.com/scikit-learn/scikit-learn/pull/11103 # - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + cdef extern from *: """ diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 5e2a56836bd2c..e5c7cd3ef1b6b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -9,19 +9,19 @@ from sklearn.utils._testing import _convert_container rng = np.random.RandomState(10) -V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = rng.rand(50, 5) V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) -DIMENSION = 3 +DIMENSION = 50 METRICS = { "euclidean": {}, "manhattan": {}, "minkowski": dict(p=3), "chebyshev": {}, - "seuclidean": dict(V=rng.random_sample(DIMENSION)), - "wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)), - "mahalanobis": dict(V=V_mahalanobis), + # "seuclidean": dict(V=rng.random_sample(DIMENSION)), + # "wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)), + # "mahalanobis": dict(V=V_mahalanobis), } DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] @@ -104,15 +104,17 @@ def one_arg_func(x): @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) -def test_ball_tree_numerical_consistency(metric): - _X = rng.random_sample((40, 3)).round(0) - _Y = rng.random_sample((10, 3)).round(0) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + rng = np.random.RandomState(global_random_seed) + spread = 1000 + _X = rng.rand(100, 50) * spread + _Y = rng.rand(5, 50) * spread - X_64 = _X.astype(dtype=np.float64) - Y_64 = _Y.astype(dtype=np.float64) + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) - X_32 = _X.astype(dtype=np.float32) - Y_32 = _Y.astype(dtype=np.float32) + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) metric_params = METRICS.get(metric, {}) bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) @@ -122,7 +124,7 @@ def test_ball_tree_numerical_consistency(metric): k = 5 dist_64, ind_64 = bt_64.query(Y_64, k=k) dist_32, ind_32 = bt_32.query(Y_32, k=k) - assert_allclose(dist_64, dist_32) + assert_allclose(dist_64, dist_32, rtol=1e-5) assert_equal(ind_64, ind_32) # Test consistency with respect to the `query_radius` method @@ -134,7 +136,7 @@ def test_ball_tree_numerical_consistency(metric): # Test consistency with respect to the `kernel_density` method kernel = "gaussian" - h = 0.1 + h = 0.001 density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel) density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel) assert_allclose(density64, density32) From 45b43a6490948bbcd93703c56c014dd1e11daf46 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 22 Mar 2023 17:57:58 +0500 Subject: [PATCH 06/36] Add whatsnew entry --- doc/whats_new/v1.3.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 477ce9ac9063a..20d621e9760ff 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -163,7 +163,7 @@ Changelog - |API| The `sample_weight` parameter in `predict` for :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` - is now deprecated and will be removed in v1.5. + is now deprecated and will be removed in v1.5. :pr:`25251` by :user:`Gleb Levitski `. - |Enhancement| The `sample_weight` parameter now will be used in centroids @@ -320,6 +320,10 @@ Changelog when `n_neighbors` is large and `algorithm="brute"` with non Euclidean metrics. :pr:`24076` by :user:`Meekail Zain `, :user:`Julien Jerphanion `. +- |Enhancement| :class:`neighbors.KDTree` and :class:`neighbors.BallTree` + now support `numpy.float32`. + :pr:`25914` by :user:`Omar Salman `. + :mod:`sklearn.neural_network` ............................. From 22d4e3792dffec8f36f723ed1053731c77406788 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 22 Mar 2023 18:00:19 +0500 Subject: [PATCH 07/36] Update setup files --- setup.cfg | 3 +++ setup.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 3ed576cedf92f..df59a7ca6c1bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -99,6 +99,9 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx + sklearn/neighbors/_ball_tree.pyx + sklearn/neighbors/_binary_tree.pxi + sklearn/neighbors/_kd_tree.pyx [codespell] diff --git a/setup.py b/setup.py index 4cdb9762589af..cdc417dd5e605 100755 --- a/setup.py +++ b/setup.py @@ -303,7 +303,8 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_ball_tree.pyx.tp", "_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, {"sources": ["_quad_tree.pyx"], "include_np": True}, From af1aa2a1ce652babd96e8555cdefb560774f997e Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 22 Mar 2023 18:06:19 +0500 Subject: [PATCH 08/36] Update test_ball_tree --- sklearn/neighbors/tests/test_ball_tree.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index e5c7cd3ef1b6b..408c317abeeca 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -9,19 +9,19 @@ from sklearn.utils._testing import _convert_container rng = np.random.RandomState(10) -V_mahalanobis = rng.rand(50, 5) +V_mahalanobis = rng.rand(3, 3) V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) -DIMENSION = 50 +DIMENSION = 3 METRICS = { "euclidean": {}, "manhattan": {}, "minkowski": dict(p=3), "chebyshev": {}, - # "seuclidean": dict(V=rng.random_sample(DIMENSION)), - # "wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)), - # "mahalanobis": dict(V=V_mahalanobis), + "seuclidean": dict(V=rng.random_sample(DIMENSION)), + "wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)), + "mahalanobis": dict(V=V_mahalanobis), } DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] @@ -103,7 +103,10 @@ def one_arg_func(x): BallTree(X, metric=one_arg_func) -@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +METRICS_TO_TEST = ["euclidean", "manhattan", "minkowski", "chebyshev"] + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) def test_ball_tree_numerical_consistency(global_random_seed, metric): rng = np.random.RandomState(global_random_seed) spread = 1000 From 30935501235c0fb6a3de56fb6be697cbf676d9be Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Wed, 22 Mar 2023 18:56:17 +0500 Subject: [PATCH 09/36] Setup.py: continue when sources is empty --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index cdc417dd5e605..0b7f3e788b73c 100755 --- a/setup.py +++ b/setup.py @@ -505,6 +505,12 @@ def configure_extension_modules(): gen_from_templates(tempita_sources) + # Do not progress if we only have a tempita file which we don't + # want to include like the .pxi.tp extension. In such a case + # sources would be empty. + if len(sources) == 0: + continue + # By convention, our extensions always use the name of the first source source_name = os.path.splitext(os.path.basename(sources[0]))[0] if submodule: From 169495fbf0e0a8884436b1f7f6c55a2f19acfea1 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 24 Mar 2023 17:08:54 +0500 Subject: [PATCH 10/36] Preserve dtype for return outputs and adjust tests --- sklearn/neighbors/_binary_tree.pxi.tp | 56 +++++++++++++---------- sklearn/neighbors/tests/test_ball_tree.py | 44 ++++++++++++++---- 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index a9944caf99c1f..e406af4432db6 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -5,7 +5,7 @@ implementation_specific_values = [ # The values are arranged as follows: # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE # # An empty string is used for the `name_suffix` of the float64 case # and '32' bit is used for the `name_suffix` of the float32 case. @@ -14,8 +14,8 @@ implementation_specific_values = [ # # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` # - ('', 'float64_t', 'np.float64'), - ('32', 'float32_t', 'np.float32') + ('', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') ] # KD Tree and Ball Tree @@ -525,8 +525,9 @@ def kernel_norm(h, d, kernel, return_log=False): else: return np.exp(result) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -cdef class NeighborsHeap: +cdef class NeighborsHeap{{name_suffix}}: """A max-heap structure to keep track of distances/indices of neighbors This implements an efficient pre-allocated set of fixed-size heaps @@ -541,19 +542,19 @@ cdef class NeighborsHeap: n_nbrs : int the size of each heap. """ - cdef float64_t[:, ::1] distances + cdef {{INPUT_DTYPE_t}}[:, ::1] distances cdef intp_t[:, ::1] indices def __cinit__(self): # One-element arrays are used as placeholders to prevent # any problem due to potential access to those attributes # (e.g. assigning to NULL or a to value in another segment). - self.distances = np.zeros((1, 1), dtype=np.float64, order='C') + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') self.indices = np.zeros((1, 1), dtype=np.intp, order='C') def __init__(self, n_pts, n_nbrs): self.distances = np.full( - (n_pts, n_nbrs), np.inf, dtype=np.float64, order='C' + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' ) self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') @@ -596,6 +597,8 @@ cdef class NeighborsHeap: ) return 0 +{{endfor}} + #------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of @@ -789,7 +792,7 @@ def newObj(obj): return obj.__new__(obj) -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} ###################################################################### # define the reverse mapping of VALID_METRICS{{name_suffix}} @@ -1162,7 +1165,7 @@ cdef class BinaryTree{{name_suffix}}: cdef {{INPUT_DTYPE_t}}* pt # initialize heap for neighbors - cdef NeighborsHeap heap = NeighborsHeap(Xarr.shape[0], k) + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) # node heap for breadth-first queries cdef NodeHeap nodeheap @@ -1274,11 +1277,11 @@ cdef class BinaryTree{{name_suffix}}: cdef intp_t i, count_i = 0 cdef intp_t n_features = self.data.shape[1] - cdef float64_t[::1] dist_arr_i + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i cdef intp_t[::1] idx_arr_i, counts cdef {{INPUT_DTYPE_t}}* pt cdef intp_t** indices = NULL - cdef float64_t** distances = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL # validate X and prepare for query X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') @@ -1306,7 +1309,7 @@ cdef class BinaryTree{{name_suffix}}: if indices == NULL: raise MemoryError() if return_distance: - distances = calloc(Xarr.shape[0], sizeof(float64_t*)) + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) if distances == NULL: free(indices) raise MemoryError() @@ -1314,7 +1317,7 @@ cdef class BinaryTree{{name_suffix}}: np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) idx_arr_i = np_idx_arr - np_dist_arr = np.zeros(self.data.shape[0], dtype=np.float64) + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) dist_arr_i = np_dist_arr counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) @@ -1347,13 +1350,16 @@ cdef class BinaryTree{{name_suffix}}: if return_distance: # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() - distances[i] = malloc(counts[i] * sizeof(float64_t)) + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) if distances[i] == NULL: memory_error = True break - memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof(float64_t)) + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) try: + for i in range(Xarr.shape[0]): + print(distances[i]) + if memory_error: raise MemoryError() @@ -1374,7 +1380,7 @@ cdef class BinaryTree{{name_suffix}}: # make a new numpy array that wraps the existing data # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 - distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_DOUBLE, distances[i]) + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) # make sure the data will be freed when the numpy array is garbage collected PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) # make sure the data is not freed twice @@ -1495,8 +1501,8 @@ cdef class BinaryTree{{name_suffix}}: Xarr_np = X.reshape((-1, n_features)) cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np - log_density_arr = np.zeros(Xarr.shape[0], dtype=np.float64) - cdef float64_t[::1] log_density = log_density_arr + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] @@ -1620,7 +1626,7 @@ cdef class BinaryTree{{name_suffix}}: cdef int _query_single_depthfirst(self, intp_t i_node, {{INPUT_DTYPE_t}}* pt, intp_t i_pt, - NeighborsHeap heap, + NeighborsHeap{{name_suffix}} heap, float64_t reduced_dist_LB) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" cdef NodeData_t node_info = self.node_data[i_node] @@ -1671,7 +1677,7 @@ cdef class BinaryTree{{name_suffix}}: cdef int _query_single_breadthfirst(self, {{INPUT_DTYPE_t}}* pt, intp_t i_pt, - NeighborsHeap heap, + NeighborsHeap{{name_suffix}} heap, NodeHeap nodeheap) except -1: """Non-recursive single-tree k-neighbors query, breadth-first search""" cdef intp_t i, i_node @@ -1721,7 +1727,7 @@ cdef class BinaryTree{{name_suffix}}: cdef int _query_dual_depthfirst(self, intp_t i_node1, BinaryTree{{name_suffix}} other, intp_t i_node2, float64_t[::1] bounds, - NeighborsHeap heap, + NeighborsHeap{{name_suffix}} heap, float64_t reduced_dist_LB) except -1: """Recursive dual-tree k-neighbors query, depth-first""" # note that the array `bounds` is maintained such that @@ -1820,7 +1826,7 @@ cdef class BinaryTree{{name_suffix}}: return 0 cdef int _query_dual_breadthfirst(self, BinaryTree{{name_suffix}} other, - NeighborsHeap heap, + NeighborsHeap{{name_suffix}} heap, NodeHeap nodeheap) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" cdef intp_t i, i1, i2, i_node1, i_node2, i_pt @@ -1907,7 +1913,7 @@ cdef class BinaryTree{{name_suffix}}: intp_t i_node, {{INPUT_DTYPE_t}}* pt, float64_t r, intp_t* indices, - float64_t* distances, + {{INPUT_DTYPE_t}}* distances, intp_t count, int count_only, int return_distance) noexcept nogil: @@ -2414,8 +2420,9 @@ def load_heap(float64_t[:, ::1] X, intp_t k): heap._push(i, X[i, j], j) return heap.get_arrays() +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): +def simultaneous_sort({{INPUT_DTYPE_t}}[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays This python wrapper exists primarily to enable unit testing @@ -2429,6 +2436,7 @@ def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): &indices[row, 0], distances.shape[1]) +{{endfor}} def nodeheap_sort(float64_t[::1] vals): """In-place reverse sort of vals using NodeHeap""" diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 408c317abeeca..0897aa93d8d39 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -109,9 +109,8 @@ def one_arg_func(x): @pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) def test_ball_tree_numerical_consistency(global_random_seed, metric): rng = np.random.RandomState(global_random_seed) - spread = 1000 - _X = rng.rand(100, 50) * spread - _Y = rng.rand(5, 50) * spread + _X = rng.rand(100, 50) + _Y = rng.rand(5, 50) X_64 = _X.astype(dtype=np.float64, copy=False) Y_64 = _Y.astype(dtype=np.float64, copy=False) @@ -131,15 +130,42 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): assert_equal(ind_64, ind_32) # Test consistency with respect to the `query_radius` method - r = 0.3 + r = 2.38 ind_64, neighbors_64 = bt_64.query_radius(Y_64[0:2, :], r=r) ind_32, neighbors_32 = bt_32.query_radius(Y_32[0:2, :], r=r) assert_equal(ind_64, ind_32) - assert_allclose(neighbors_64, neighbors_32) + assert_allclose( + neighbors_64, + neighbors_32, + ) + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64[4:5, :], r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32[4:5, :], r=r, return_distance=True) + assert_equal(ind_64[0], ind_32[0]) + assert_allclose(dist_64[0], dist_32[0], rtol=1e-5) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency_kernel_density(global_random_seed, metric): # Test consistency with respect to the `kernel_density` method + rng = np.random.RandomState(global_random_seed) + _X = rng.random_sample((100, 3)) + _Y = rng.random_sample((5, 3)) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + kernel = "gaussian" - h = 0.001 - density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel) - density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel) - assert_allclose(density64, density32) + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) From ba1ce1bac71c8f97315bce8ab9ed24235cdbe8cd Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 24 Mar 2023 17:18:44 +0500 Subject: [PATCH 11/36] Add a test for two point correlation --- sklearn/neighbors/tests/test_ball_tree.py | 24 ++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 0897aa93d8d39..5f016add3f22e 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -148,7 +148,7 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): @pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) -def test_ball_tree_numerical_consistency_kernel_density(global_random_seed, metric): +def test_kernel_density_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `kernel_density` method rng = np.random.RandomState(global_random_seed) _X = rng.random_sample((100, 3)) @@ -169,3 +169,25 @@ def test_ball_tree_numerical_consistency_kernel_density(global_random_seed, metr density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) assert_allclose(density64, density32, rtol=1e-5) + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + rng = np.random.RandomState(global_random_seed) + _X = rng.random_sample((100, 3)) + _Y = rng.random_sample((5, 3)) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + bt_64 = BallTree(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) From c7d959aefa94e4d036bce75dee735019245789f3 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 24 Mar 2023 17:21:50 +0500 Subject: [PATCH 12/36] Remove debug print --- sklearn/neighbors/_binary_tree.pxi.tp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index e406af4432db6..6f0b5418d1ca5 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1357,9 +1357,6 @@ cdef class BinaryTree{{name_suffix}}: memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) try: - for i in range(Xarr.shape[0]): - print(distances[i]) - if memory_error: raise MemoryError() From 66c578ac751e946648b6a0f3a92b1b286dab2d5c Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 24 Mar 2023 17:30:11 +0500 Subject: [PATCH 13/36] Add assertions for float dtypes --- sklearn/neighbors/tests/test_ball_tree.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 5f016add3f22e..55d15f68c3f2a 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -128,6 +128,8 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): dist_32, ind_32 = bt_32.query(Y_32, k=k) assert_allclose(dist_64, dist_32, rtol=1e-5) assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 # Test consistency with respect to the `query_radius` method r = 2.38 @@ -145,6 +147,8 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): ind_32, dist_32 = bt_32.query_radius(Y_32[4:5, :], r=r, return_distance=True) assert_equal(ind_64[0], ind_32[0]) assert_allclose(dist_64[0], dist_32[0], rtol=1e-5) + assert dist_64[0].dtype == np.float64 + assert dist_32[0].dtype == np.float32 @pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) @@ -169,6 +173,8 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 def test_two_point_correlation_numerical_consistency(global_random_seed): From 2368aa850719e27f001e59b4ada0aa1ba73da7e6 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 24 Mar 2023 17:33:21 +0500 Subject: [PATCH 14/36] Rename METRICS_TO_TEST --- sklearn/neighbors/tests/test_ball_tree.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 55d15f68c3f2a..f5e6ec496f557 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -19,6 +19,9 @@ "manhattan": {}, "minkowski": dict(p=3), "chebyshev": {}, +} + +ADDITIONAL_METRICS = { "seuclidean": dict(V=rng.random_sample(DIMENSION)), "wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)), "mahalanobis": dict(V=V_mahalanobis), @@ -103,10 +106,7 @@ def one_arg_func(x): BallTree(X, metric=one_arg_func) -METRICS_TO_TEST = ["euclidean", "manhattan", "minkowski", "chebyshev"] - - -@pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) def test_ball_tree_numerical_consistency(global_random_seed, metric): rng = np.random.RandomState(global_random_seed) _X = rng.rand(100, 50) @@ -151,7 +151,7 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): assert dist_32[0].dtype == np.float32 -@pytest.mark.parametrize("metric", itertools.chain(METRICS_TO_TEST, BOOLEAN_METRICS)) +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) def test_kernel_density_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `kernel_density` method rng = np.random.RandomState(global_random_seed) From 5b6c2bc76ed7e763ca7f6251e749bb6ed8507a24 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 24 Mar 2023 17:50:55 +0500 Subject: [PATCH 15/36] Revert tempita on simultaneous_sort function --- sklearn/neighbors/_binary_tree.pxi.tp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 6f0b5418d1ca5..9737ff05e439c 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -2417,9 +2417,8 @@ def load_heap(float64_t[:, ::1] X, intp_t k): heap._push(i, X[i, j], j) return heap.get_arrays() -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -def simultaneous_sort({{INPUT_DTYPE_t}}[:, ::1] distances, intp_t[:, ::1] indices): +def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays This python wrapper exists primarily to enable unit testing @@ -2433,7 +2432,6 @@ def simultaneous_sort({{INPUT_DTYPE_t}}[:, ::1] distances, intp_t[:, ::1] indice &indices[row, 0], distances.shape[1]) -{{endfor}} def nodeheap_sort(float64_t[::1] vals): """In-place reverse sort of vals using NodeHeap""" From 96d54c1764720ea966097871e9ef421b8062b1bf Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 27 Mar 2023 14:05:49 +0500 Subject: [PATCH 16/36] Add similar tests for kd tree --- sklearn/neighbors/tests/test_ball_tree.py | 52 ++++++++++++------ sklearn/neighbors/tests/test_kd_tree.py | 67 ++++++++++++++++++++++- 2 files changed, 100 insertions(+), 19 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index f5e6ec496f557..a0236c10708b3 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -108,15 +108,9 @@ def one_arg_func(x): @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) def test_ball_tree_numerical_consistency(global_random_seed, metric): - rng = np.random.RandomState(global_random_seed) - _X = rng.rand(100, 50) - _Y = rng.rand(5, 50) - - X_64 = _X.astype(dtype=np.float64, copy=False) - Y_64 = _Y.astype(dtype=np.float64, copy=False) - - X_32 = _X.astype(dtype=np.float32, copy=False) - Y_32 = _Y.astype(dtype=np.float32, copy=False) + X_64, X_32, Y_64, Y_32 = get_dataset_for_query_methods( + random_seed=global_random_seed + ) metric_params = METRICS.get(metric, {}) bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) @@ -154,15 +148,9 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) def test_kernel_density_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `kernel_density` method - rng = np.random.RandomState(global_random_seed) - _X = rng.random_sample((100, 3)) - _Y = rng.random_sample((5, 3)) - - X_64 = _X.astype(dtype=np.float64, copy=False) - Y_64 = _Y.astype(dtype=np.float64, copy=False) - - X_32 = _X.astype(dtype=np.float32, copy=False) - Y_32 = _Y.astype(dtype=np.float32, copy=False) + X_64, X_32, Y_64, Y_32 = get_dataset_for_kernel_density( + random_seed=global_random_seed + ) metric_params = METRICS.get(metric, {}) bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) @@ -197,3 +185,31 @@ def test_two_point_correlation_numerical_consistency(global_random_seed): counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) assert_allclose(counts_64, counts_32) + + +def get_dataset_for_query_methods(random_seed): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, 50) + _Y = rng.rand(5, 50) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 + + +def get_dataset_for_kernel_density(random_seed): + rng = np.random.RandomState(random_seed) + _X = rng.random_sample((100, 3)) + _Y = rng.random_sample((5, 3)) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 525c15436e24c..20a2814f970f3 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,8 +1,14 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_equal + +from sklearn.neighbors.tests.test_ball_tree import ( + get_dataset_for_query_methods, + get_dataset_for_kernel_density, +) from sklearn.utils.parallel import delayed, Parallel -from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._kd_tree import KDTree, KDTree32 DIMENSION = 3 @@ -28,3 +34,62 @@ def test_kdtree_picklable_with_joblib(): # use to raise "ValueError: buffer source array is read-only" in a previous # version of the Cython code. Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + X_64, X_32, Y_64, Y_32 = get_dataset_for_query_methods( + random_seed=global_random_seed + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64, neighbors_64 = kd_64.query_radius(Y_64[0:2, :], r=r) + ind_32, neighbors_32 = kd_32.query_radius(Y_32[0:2, :], r=r) + assert_equal(ind_64, ind_32) + assert_allclose( + neighbors_64, + neighbors_32, + ) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64[4:5, :], r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32[4:5, :], r=r, return_distance=True) + assert_equal(ind_64[0], ind_32[0]) + assert_allclose(dist_64[0], dist_32[0], rtol=1e-5) + assert dist_64[0].dtype == np.float64 + assert dist_32[0].dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_kernel_density( + random_seed=global_random_seed + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 From 88012420a8cc9cd4dc7ce85c07ebe0e16bad9786 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 5 Jun 2023 12:12:46 +0500 Subject: [PATCH 17/36] PR suggestions --- sklearn/neighbors/_ball_tree.pyx.tp | 2 +- sklearn/neighbors/_binary_tree.pxi.tp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 1e04917d4be0d..bfbaa486d9cd5 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -63,7 +63,7 @@ include "_binary_tree.pxi" # Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): - __doc__ = CLASS_DOC.format(**DOC_DICT) + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) pass {{endfor}} diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 9737ff05e439c..d6bfcf7be901a 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -797,7 +797,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS{{name_suffix}} from sklearn.metrics._dist_metrics import get_valid_metric_ids -VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) ###################################################################### @@ -825,7 +825,7 @@ cdef class BinaryTree{{name_suffix}}: cdef int n_splits cdef int n_calls - _valid_metrics = VALID_METRIC_IDS + _valid_metrics = VALID_METRIC_IDS{{name_suffix}} # Use cinit to initialize all arrays to empty: this will prevent memory # errors and seg-faults in rare cases where __init__ is not called From 5a2c1e776da59872113b004c8b83f9644fa7b6b5 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 10:24:30 +0500 Subject: [PATCH 18/36] Additional changes needed to sync with main and include the distance metric updates --- sklearn/neighbors/__init__.py | 4 ++-- sklearn/neighbors/_ball_tree.pyx | 0 sklearn/neighbors/_ball_tree.pyx.tp | 7 +++---- sklearn/neighbors/_base.py | 4 ++-- sklearn/neighbors/_binary_tree.pxi.tp | 13 ++++++++++--- sklearn/neighbors/_kd_tree.pyx.tp | 7 +++---- sklearn/neighbors/_kde.py | 4 ++-- sklearn/neighbors/tests/test_ball_tree.py | 18 +++++++++--------- sklearn/neighbors/tests/test_kd_tree.py | 10 +++++----- 9 files changed, 36 insertions(+), 31 deletions(-) delete mode 100644 sklearn/neighbors/_ball_tree.pyx diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8223c20991904..13abefc6298a3 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -3,8 +3,8 @@ algorithm. """ -from ._ball_tree import BallTree -from ._kd_tree import KDTree +from ._ball_tree import BallTree64 as BallTree +from ._kd_tree import KDTree64 as KDTree from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index bfbaa486d9cd5..2f9a5eae23243 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -9,12 +9,11 @@ implementation_specific_values = [ # # An empty string is used for the `name_suffix` of the float64 case # and '32' bit is used for the `name_suffix` of the float32 case. - # This allows us to use `BinaryTree` conveniently and the default - # float64 case can be used without any particular modifications. + # This allows us to use `BinaryTree` conveniently. # # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` # - ('', 'float64_t', 'np.float64'), + ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] @@ -24,7 +23,7 @@ implementation_specific_values = [ }} -__all__ = ['BallTree', 'BallTree32'] +__all__ = ['BallTree64', 'BallTree32'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index c812149970e81..f57904f1380ef 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -18,8 +18,8 @@ from scipy.sparse import csr_matrix, issparse from joblib import effective_n_jobs -from ._ball_tree import BallTree -from ._kd_tree import KDTree +from ._ball_tree import BallTree64 as BallTree +from ._kd_tree import KDTree64 as KDTree from ..base import BaseEstimator, MultiOutputMixin from ..base import is_classifier from ..metrics import pairwise_distances_chunked diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 3ce01acc0fac9..e3fe9cab5894b 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -14,7 +14,7 @@ implementation_specific_values = [ # # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` # - ('', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') ] @@ -178,9 +178,13 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, DistanceMetric64, + DistanceMetric32, euclidean_dist64, + euclidean_dist32, euclidean_rdist64, + euclidean_rdist32, euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, ) from ._partition_nodes cimport partition_node_indices @@ -860,7 +864,7 @@ cdef class BinaryTree{{name_suffix}}: raise ValueError("leaf_size must be greater than or equal to 1") self.leaf_size = leaf_size - self.dist_metric = DistanceMetric{{name_suffix}}.get_metric(metric, **kwargs) + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance{{name_suffix}}') @@ -2403,16 +2407,19 @@ cdef class BinaryTree{{name_suffix}}: ###################################################################### # Python functions for benchmarking and testing C implementations +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + def load_heap(float64_t[:, ::1] X, intp_t k): """test fully loading the heap""" assert k <= X.shape[1] - cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(X.shape[0], k) cdef intp_t i, j for i in range(X.shape[0]): for j in range(X.shape[1]): heap._push(i, X[i, j], j) return heap.get_arrays() +{{endfor}} def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 4c543187d788d..000da915122a6 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -9,12 +9,11 @@ implementation_specific_values = [ # # An empty string is used for the `name_suffix` of the float64 case # and '32' bit is used for the `name_suffix` of the float32 case. - # This allows us to use `BinaryTree` conveniently and the default - # float64 case can be used without any particular modifications. + # This allows us to use `BinaryTree` conveniently. # # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` # - ('', 'float64_t', 'np.float64'), + ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] @@ -25,7 +24,7 @@ implementation_specific_values = [ }} -__all__ = ['KDTree', 'KDTree32'] +__all__ = ['KDTree64', 'KDTree32'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f285b03403b5f..74dc486f17b35 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -15,8 +15,8 @@ from ..utils.validation import _check_sample_weight, check_is_fitted from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms -from ._ball_tree import BallTree -from ._kd_tree import KDTree +from ._ball_tree import BallTree64 as BallTree +from ._kd_tree import KDTree64 as KDTree VALID_KERNELS = [ diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 37ef03386d747..f5c9e1986a6c1 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -3,7 +3,7 @@ import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_allclose, assert_equal -from sklearn.neighbors._ball_tree import BallTree, BallTree32 +from sklearn.neighbors._ball_tree import BallTree64, BallTree32 from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -64,7 +64,7 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree(X, leaf_size=1, metric=metric) + bt = BallTree64(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) @@ -73,7 +73,7 @@ def test_ball_tree_query_metrics(metric, array_type): def test_query_haversine(): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree(X, leaf_size=1, metric="haversine") + bt = BallTree64(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") @@ -85,7 +85,7 @@ def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree(X) + BallTree64(X) def test_bad_pyfunc_metric(): @@ -98,11 +98,11 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_returned_value) + BallTree64(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree(X, metric=one_arg_func) + BallTree64(X, metric=one_arg_func) @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) @@ -112,7 +112,7 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) # Test consistency with respect to the `query` method @@ -152,7 +152,7 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) kernel = "gaussian" @@ -176,7 +176,7 @@ def test_two_point_correlation_numerical_consistency(global_random_seed): X_32 = _X.astype(dtype=np.float32, copy=False) Y_32 = _Y.astype(dtype=np.float32, copy=False) - bt_64 = BallTree(X_64, leaf_size=10) + bt_64 = BallTree64(X_64, leaf_size=10) bt_32 = BallTree32(X_32, leaf_size=10) r = np.linspace(0, 1, 10) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 20a2814f970f3..da70327a2a856 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -8,7 +8,7 @@ ) from sklearn.utils.parallel import delayed, Parallel -from sklearn.neighbors._kd_tree import KDTree, KDTree32 +from sklearn.neighbors._kd_tree import KDTree64, KDTree32 DIMENSION = 3 @@ -19,7 +19,7 @@ def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree(X) + KDTree64(X) def test_kdtree_picklable_with_joblib(): @@ -28,7 +28,7 @@ def test_kdtree_picklable_with_joblib(): Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree(X, leaf_size=2) + tree = KDTree64(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous @@ -43,7 +43,7 @@ def test_kd_tree_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) # Test consistency with respect to the `query` method @@ -83,7 +83,7 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) kernel = "gaussian" From 11744c868cb36c692741071db8c7db270acfbb32 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 10:47:10 +0500 Subject: [PATCH 19/36] Fix imports in test_neighbors_tree.py --- sklearn/neighbors/tests/test_neighbors_tree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index fca0049669c6a..42548cd9690ee 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -8,15 +8,15 @@ from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( - BallTree, + BallTree64 as BallTree, kernel_norm, - NeighborsHeap as NeighborsHeapBT, + NeighborsHeap64 as NeighborsHeapBT, simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( - KDTree, - NeighborsHeap as NeighborsHeapKDT, + KDTree64 as KDTree, + NeighborsHeap64 as NeighborsHeapKDT, simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, ) From 1e24cd9f7f5b5305beb9313f85e96eff1f3b13cb Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 11:53:17 +0500 Subject: [PATCH 20/36] Correct references to BallTree64 and KDTree64 in doc for neighbors --- doc/modules/neighbors.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 90856b6933f3e..5097629e03542 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -50,7 +50,7 @@ Unsupervised Nearest Neighbors :class:`NearestNeighbors` implements unsupervised nearest neighbors learning. It acts as a uniform interface to three different nearest neighbors -algorithms: :class:`BallTree`, :class:`KDTree`, and a +algorithms: :class:`BallTree64`, :class:`KDTree64`, and a brute-force algorithm based on routines in :mod:`sklearn.metrics.pairwise`. The choice of neighbors search algorithm is controlled through the keyword ``'algorithm'``, which must be one of @@ -116,15 +116,15 @@ unsupervised learning: in particular, see :class:`~sklearn.manifold.Isomap`, KDTree and BallTree Classes --------------------------- -Alternatively, one can use the :class:`KDTree` or :class:`BallTree` classes +Alternatively, one can use the :class:`KDTree64` or :class:`BallTree64` classes directly to find nearest neighbors. This is the functionality wrapped by the :class:`NearestNeighbors` class used above. The Ball Tree and KD Tree have the same interface; we'll show an example of using the KD Tree here: - >>> from sklearn.neighbors import KDTree + >>> from sklearn.neighbors import KDTree64 >>> import numpy as np >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - >>> kdt = KDTree(X, leaf_size=30, metric='euclidean') + >>> kdt = KDTree64(X, leaf_size=30, metric='euclidean') >>> kdt.query(X, k=2, return_distance=False) array([[0, 1], [1, 0], @@ -133,15 +133,15 @@ have the same interface; we'll show an example of using the KD Tree here: [4, 3], [5, 4]]...) -Refer to the :class:`KDTree` and :class:`BallTree` class documentation +Refer to the :class:`KDTree64` and :class:`BallTree64` class documentation for more information on the options available for nearest neighbors searches, including specification of query strategies, distance metrics, etc. For a list -of valid metrics use :meth:`KDTree.valid_metrics` and :meth:`BallTree.valid_metrics`: +of valid metrics use :meth:`KDTree64.valid_metrics` and :meth:`BallTree64.valid_metrics`: - >>> from sklearn.neighbors import KDTree, BallTree - >>> KDTree.valid_metrics() + >>> from sklearn.neighbors import KDTree64, BallTree64 + >>> KDTree64.valid_metrics() ['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity'] - >>> BallTree.valid_metrics() + >>> BallTree64.valid_metrics() ['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity', 'seuclidean', 'mahalanobis', 'hamming', 'canberra', 'braycurtis', 'jaccard', 'dice', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', 'haversine', 'pyfunc'] .. _classification: @@ -306,7 +306,7 @@ neighbors searches, it becomes inefficient as :math:`D` grows very large: this is one manifestation of the so-called "curse of dimensionality". In scikit-learn, KD tree neighbors searches are specified using the keyword ``algorithm = 'kd_tree'``, and are computed using the class -:class:`KDTree`. +:class:`KDTree64`. .. topic:: References: @@ -344,8 +344,8 @@ a *KD-tree* in high dimensions, though the actual performance is highly dependent on the structure of the training data. In scikit-learn, ball-tree-based neighbors searches are specified using the keyword ``algorithm = 'ball_tree'``, -and are computed using the class :class:`BallTree`. -Alternatively, the user can work with the :class:`BallTree` class directly. +and are computed using the class :class:`BallTree64`. +Alternatively, the user can work with the :class:`BallTree64` class directly. .. topic:: References: @@ -374,7 +374,7 @@ depends on a number of factors: For small data sets (:math:`N` less than 30 or so), :math:`\log(N)` is comparable to :math:`N`, and brute force algorithms can be more efficient - than a tree-based approach. Both :class:`KDTree` and :class:`BallTree` + than a tree-based approach. Both :class:`KDTree64` and :class:`BallTree64` address this through providing a *leaf size* parameter: this controls the number of samples at which a query switches to brute-force. This allows both algorithms to approach the efficiency of a brute-force computation for small @@ -464,7 +464,7 @@ leaf nodes. The level of this switch can be specified with the parameter As ``leaf_size`` increases, the memory required to store a tree structure decreases. This is especially important in the case of ball tree, which stores a :math:`D`-dimensional centroid for each node. The required - storage space for :class:`BallTree` is approximately ``1 / leaf_size`` times + storage space for :class:`BallTree64` is approximately ``1 / leaf_size`` times the size of the training set. ``leaf_size`` is not referenced for brute force queries. From 26ab32235ccaae4685a282db27ee4648e4e4dc6d Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 12:35:04 +0500 Subject: [PATCH 21/36] Rollback some recent changes --- doc/modules/neighbors.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 5097629e03542..90856b6933f3e 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -50,7 +50,7 @@ Unsupervised Nearest Neighbors :class:`NearestNeighbors` implements unsupervised nearest neighbors learning. It acts as a uniform interface to three different nearest neighbors -algorithms: :class:`BallTree64`, :class:`KDTree64`, and a +algorithms: :class:`BallTree`, :class:`KDTree`, and a brute-force algorithm based on routines in :mod:`sklearn.metrics.pairwise`. The choice of neighbors search algorithm is controlled through the keyword ``'algorithm'``, which must be one of @@ -116,15 +116,15 @@ unsupervised learning: in particular, see :class:`~sklearn.manifold.Isomap`, KDTree and BallTree Classes --------------------------- -Alternatively, one can use the :class:`KDTree64` or :class:`BallTree64` classes +Alternatively, one can use the :class:`KDTree` or :class:`BallTree` classes directly to find nearest neighbors. This is the functionality wrapped by the :class:`NearestNeighbors` class used above. The Ball Tree and KD Tree have the same interface; we'll show an example of using the KD Tree here: - >>> from sklearn.neighbors import KDTree64 + >>> from sklearn.neighbors import KDTree >>> import numpy as np >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - >>> kdt = KDTree64(X, leaf_size=30, metric='euclidean') + >>> kdt = KDTree(X, leaf_size=30, metric='euclidean') >>> kdt.query(X, k=2, return_distance=False) array([[0, 1], [1, 0], @@ -133,15 +133,15 @@ have the same interface; we'll show an example of using the KD Tree here: [4, 3], [5, 4]]...) -Refer to the :class:`KDTree64` and :class:`BallTree64` class documentation +Refer to the :class:`KDTree` and :class:`BallTree` class documentation for more information on the options available for nearest neighbors searches, including specification of query strategies, distance metrics, etc. For a list -of valid metrics use :meth:`KDTree64.valid_metrics` and :meth:`BallTree64.valid_metrics`: +of valid metrics use :meth:`KDTree.valid_metrics` and :meth:`BallTree.valid_metrics`: - >>> from sklearn.neighbors import KDTree64, BallTree64 - >>> KDTree64.valid_metrics() + >>> from sklearn.neighbors import KDTree, BallTree + >>> KDTree.valid_metrics() ['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity'] - >>> BallTree64.valid_metrics() + >>> BallTree.valid_metrics() ['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity', 'seuclidean', 'mahalanobis', 'hamming', 'canberra', 'braycurtis', 'jaccard', 'dice', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', 'haversine', 'pyfunc'] .. _classification: @@ -306,7 +306,7 @@ neighbors searches, it becomes inefficient as :math:`D` grows very large: this is one manifestation of the so-called "curse of dimensionality". In scikit-learn, KD tree neighbors searches are specified using the keyword ``algorithm = 'kd_tree'``, and are computed using the class -:class:`KDTree64`. +:class:`KDTree`. .. topic:: References: @@ -344,8 +344,8 @@ a *KD-tree* in high dimensions, though the actual performance is highly dependent on the structure of the training data. In scikit-learn, ball-tree-based neighbors searches are specified using the keyword ``algorithm = 'ball_tree'``, -and are computed using the class :class:`BallTree64`. -Alternatively, the user can work with the :class:`BallTree64` class directly. +and are computed using the class :class:`BallTree`. +Alternatively, the user can work with the :class:`BallTree` class directly. .. topic:: References: @@ -374,7 +374,7 @@ depends on a number of factors: For small data sets (:math:`N` less than 30 or so), :math:`\log(N)` is comparable to :math:`N`, and brute force algorithms can be more efficient - than a tree-based approach. Both :class:`KDTree64` and :class:`BallTree64` + than a tree-based approach. Both :class:`KDTree` and :class:`BallTree` address this through providing a *leaf size* parameter: this controls the number of samples at which a query switches to brute-force. This allows both algorithms to approach the efficiency of a brute-force computation for small @@ -464,7 +464,7 @@ leaf nodes. The level of this switch can be specified with the parameter As ``leaf_size`` increases, the memory required to store a tree structure decreases. This is especially important in the case of ball tree, which stores a :math:`D`-dimensional centroid for each node. The required - storage space for :class:`BallTree64` is approximately ``1 / leaf_size`` times + storage space for :class:`BallTree` is approximately ``1 / leaf_size`` times the size of the training set. ``leaf_size`` is not referenced for brute force queries. From 65c3e2b50731e7c820188e5b69462635aae26580 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 7 Jun 2023 14:42:29 +0500 Subject: [PATCH 22/36] Fix the doc issue by removing extra metric from ball tree --- sklearn/neighbors/_ball_tree.pyx.tp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 2f9a5eae23243..6d7245cbb2317 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -33,25 +33,24 @@ DOC_DICT{{name_suffix}} = { } VALID_METRICS{{name_suffix}} = [ - 'EuclideanDistance{{name_suffix}}', - 'SEuclideanDistance{{name_suffix}}', - 'ManhattanDistance{{name_suffix}}', + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', 'ChebyshevDistance{{name_suffix}}', - 'MinkowskiDistance{{name_suffix}}', - 'WMinkowskiDistance{{name_suffix}}', - 'MahalanobisDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', 'HammingDistance{{name_suffix}}', - 'CanberraDistance{{name_suffix}}', - 'BrayCurtisDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', 'JaccardDistance{{name_suffix}}', - 'MatchingDistance{{name_suffix}}', - 'DiceDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', 'RogersTanimotoDistance{{name_suffix}}', 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', 'SokalMichenerDistance{{name_suffix}}', 'SokalSneathDistance{{name_suffix}}', - 'PyFuncDistance{{name_suffix}}', - 'HaversineDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', ] {{endfor}} From 9027a9206e8d1a393b750985b0403f9324d46d54 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 13 Jun 2023 12:48:17 +0500 Subject: [PATCH 23/36] Change structure - Use a BallTree and KDTree class --- sklearn/neighbors/__init__.py | 4 +- sklearn/neighbors/_ball_tree.pyx.tp | 108 ++++++++++++++++- sklearn/neighbors/_base.py | 8 +- sklearn/neighbors/_binary_tree.pxi.tp | 2 +- sklearn/neighbors/_kd_tree.pyx.tp | 110 +++++++++++++++++- sklearn/neighbors/_kde.py | 4 +- sklearn/neighbors/tests/test_ball_tree.py | 26 ++--- .../neighbors/tests/test_neighbors_tree.py | 4 +- 8 files changed, 239 insertions(+), 27 deletions(-) diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 13abefc6298a3..8223c20991904 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -3,8 +3,8 @@ algorithm. """ -from ._ball_tree import BallTree64 as BallTree -from ._kd_tree import KDTree64 as KDTree +from ._ball_tree import BallTree +from ._kd_tree import KDTree from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 6d7245cbb2317..da896f826441f 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -23,7 +23,7 @@ implementation_specific_values = [ }} -__all__ = ['BallTree64', 'BallTree32'] +__all__ = ['BallTree'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -283,3 +283,109 @@ cdef inline float64_t max_rdist_dual{{name_suffix}}( ) {{endfor}} + + +cdef class BallTree: + """ + Definition for the BallTree class that any underlying python estimators + are using. Within this class the respective float64 or float32 variants + are called depending on the data type of input data. This is done by + initializing BallTree64 or BallTree32 respectively as a private variable + and then calling any required public methods through this variable. + """ + + cdef object __ball_tree + + def __init__( + self, + data, + leaf_size=40, + metric='minkowski', + sample_weight=None, + **kwargs, + ): + dtype = data.dtype + if dtype == np.float64: + specialized_class = BallTree64 + elif dtype == np.float32: + specialized_class = BallTree32 + else: + raise ValueError( + f"Unexpected dtype {dtype} provided. Please select a dtype from" + " {np.float32, np.float64}" + ) + + self.__ball_tree = specialized_class( + data=data, + leaf_size=leaf_size, + metric=metric, + sample_weight=sample_weight, + **kwargs, + ) + + @classmethod + def valid_metrics(cls): + # The metric ids/names are the same whether we consider + # BallTree64 or BallTree32. So any of the two can be used. + return BallTree64.valid_metrics() + + def query( + self, + X, + k=1, + return_distance=True, + dualtree=False, + breadth_first=False, + sort_results=True, + ): + return self.__ball_tree.query( + X=X, + k=k, + return_distance=return_distance, + dualtree=dualtree, + breadth_first=breadth_first, + sort_results=sort_results, + ) + + def query_radius( + self, + X, + r, + return_distance=False, + count_only=False, + sort_results=False, + ): + return self.__ball_tree.query_radius( + X=X, + r=r, + return_distance=return_distance, + count_only=count_only, + sort_results=sort_results, + ) + + def kernel_density( + self, + X, + h, + kernel='gaussian', + atol=0, + rtol=1E-8, + breadth_first=True, + return_log=False, + ): + return self.__ball_tree.kernel_density( + X=X, + h=h, + kernel=kernel, + atol=atol, + rtol=rtol, + breadth_first=breadth_first, + return_log=return_log, + ) + + def two_point_correlation(self, X, r, dualtree=False): + return self.__ball_tree.two_point_correlation( + X=X, + r=r, + dualtree=dualtree, + ) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index f57904f1380ef..82626d6705bb4 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -18,8 +18,8 @@ from scipy.sparse import csr_matrix, issparse from joblib import effective_n_jobs -from ._ball_tree import BallTree64 as BallTree -from ._kd_tree import KDTree64 as KDTree +from ._ball_tree import BallTree +from ._kd_tree import KDTree from ..base import BaseEstimator, MultiOutputMixin from ..base import is_classifier from ..metrics import pairwise_distances_chunked @@ -68,8 +68,8 @@ SCIPY_METRICS += ["matching"] VALID_METRICS = dict( - ball_tree=BallTree._valid_metrics, - kd_tree=KDTree._valid_metrics, + ball_tree=BallTree.valid_metrics(), + kd_tree=KDTree.valid_metrics(), # The following list comes from the # sklearn.metrics.pairwise doc string brute=sorted(set(PAIRWISE_DISTANCE_FUNCTIONS).union(SCIPY_METRICS)), diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index e3fe9cab5894b..676ffead07145 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -2409,7 +2409,7 @@ cdef class BinaryTree{{name_suffix}}: {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -def load_heap(float64_t[:, ::1] X, intp_t k): +def load_heap{{name_suffix}}({{INPUT_DTYPE_t}}[:, ::1] X, intp_t k): """test fully loading the heap""" assert k <= X.shape[1] cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(X.shape[0], k) diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 000da915122a6..4f17b57bacacd 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -1,6 +1,6 @@ {{py: -# Generated file: _ball_tree.pyx +# Generated file: _kd_tree.pyx implementation_specific_values = [ # The values are arranged as follows: @@ -24,7 +24,7 @@ implementation_specific_values = [ }} -__all__ = ['KDTree64', 'KDTree32'] +__all__ = ['KDTree'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -335,3 +335,109 @@ cdef inline float64_t max_dist_dual{{name_suffix}}( ) {{endfor}} + + +cdef class KDTree: + """ + Definition for the KDTree class that any underlying python estimators + are using. Within this class the respective float64 or float32 variants + are called depending on the data type of input data. This is done by + initializing KDTree64 or KDTree32 respectively as a private variable + and then calling any required public methods through this variable. + """ + + cdef object __kd_tree + + def __init__( + self, + data, + leaf_size=40, + metric='minkowski', + sample_weight=None, + **kwargs, + ): + dtype = data.dtype + if dtype == np.float64: + specialized_class = KDTree64 + elif dtype == np.float32: + specialized_class = KDTree32 + else: + raise ValueError( + f"Unexpected dtype {dtype} provided. Please select a dtype from" + " {np.float32, np.float64}" + ) + + self.__kd_tree = specialized_class( + data=data, + leaf_size=leaf_size, + metric=metric, + sample_weight=sample_weight, + **kwargs, + ) + + @classmethod + def valid_metrics(cls): + # The metric ids/names are the same whether we consider + # KDTree64 or KDTree32. So any of the two can be used. + return KDTree64.valid_metrics() + + def query( + self, + X, + k=1, + return_distance=True, + dualtree=False, + breadth_first=False, + sort_results=True, + ): + return self.__kd_tree.query( + X=X, + k=k, + return_distance=return_distance, + dualtree=dualtree, + breadth_first=breadth_first, + sort_results=sort_results, + ) + + def query_radius( + self, + X, + r, + return_distance=False, + count_only=False, + sort_results=False, + ): + return self.__kd_tree.query_radius( + X=X, + r=r, + return_distance=return_distance, + count_only=count_only, + sort_results=sort_results, + ) + + def kernel_density( + self, + X, + h, + kernel='gaussian', + atol=0, + rtol=1E-8, + breadth_first=True, + return_log=False, + ): + return self.__kd_tree.kernel_density( + X=X, + h=h, + kernel=kernel, + atol=atol, + rtol=rtol, + breadth_first=breadth_first, + return_log=return_log, + ) + + def two_point_correlation(self, X, r, dualtree=False): + return self.__kd_tree.two_point_correlation( + X=X, + r=r, + dualtree=dualtree, + ) diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 74dc486f17b35..f285b03403b5f 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -15,8 +15,8 @@ from ..utils.validation import _check_sample_weight, check_is_fitted from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms -from ._ball_tree import BallTree64 as BallTree -from ._kd_tree import KDTree64 as KDTree +from ._ball_tree import BallTree +from ._kd_tree import KDTree VALID_KERNELS = [ diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index f5c9e1986a6c1..e2b78f6d23228 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -3,7 +3,7 @@ import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_allclose, assert_equal -from sklearn.neighbors._ball_tree import BallTree64, BallTree32 +from sklearn.neighbors._ball_tree import BallTree from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -64,7 +64,7 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree64(X, leaf_size=1, metric=metric) + bt = BallTree(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) @@ -73,7 +73,7 @@ def test_ball_tree_query_metrics(metric, array_type): def test_query_haversine(): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree64(X, leaf_size=1, metric="haversine") + bt = BallTree(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") @@ -84,8 +84,8 @@ def test_query_haversine(): def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) - with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree64(X) + with pytest.raises(ValueError, match="Unexpected dtype object provided"): + BallTree(X) def test_bad_pyfunc_metric(): @@ -98,11 +98,11 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree64(X, metric=wrong_returned_value) + BallTree(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree64(X, metric=one_arg_func) + BallTree(X, metric=one_arg_func) @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) @@ -112,8 +112,8 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) - bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree(X_32, leaf_size=1, metric=metric, **metric_params) # Test consistency with respect to the `query` method k = 5 @@ -152,8 +152,8 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) - bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree(X_32, leaf_size=1, metric=metric, **metric_params) kernel = "gaussian" h = 0.1 @@ -176,8 +176,8 @@ def test_two_point_correlation_numerical_consistency(global_random_seed): X_32 = _X.astype(dtype=np.float32, copy=False) Y_32 = _Y.astype(dtype=np.float32, copy=False) - bt_64 = BallTree64(X_64, leaf_size=10) - bt_32 = BallTree32(X_32, leaf_size=10) + bt_64 = BallTree(X_64, leaf_size=10) + bt_32 = BallTree(X_32, leaf_size=10) r = np.linspace(0, 1, 10) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index 42548cd9690ee..ce9fb11cf2510 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -8,14 +8,14 @@ from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( - BallTree64 as BallTree, + BallTree, kernel_norm, NeighborsHeap64 as NeighborsHeapBT, simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( - KDTree64 as KDTree, + KDTree, NeighborsHeap64 as NeighborsHeapKDT, simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, From 292a3f000e194067031c2ba6cc32b75f7a5ad20c Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 13 Jun 2023 14:02:42 +0500 Subject: [PATCH 24/36] Fix tests and related issues --- sklearn/neighbors/_ball_tree.pyx.tp | 21 ++++++++++----------- sklearn/neighbors/_kd_tree.pyx.tp | 16 ++++++++-------- sklearn/neighbors/tests/test_kd_tree.py | 16 ++++++++-------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index da896f826441f..3a32c1e7b7197 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -294,7 +294,7 @@ cdef class BallTree: and then calling any required public methods through this variable. """ - cdef object __ball_tree + cdef object _ball_tree def __init__( self, @@ -304,18 +304,17 @@ cdef class BallTree: sample_weight=None, **kwargs, ): - dtype = data.dtype - if dtype == np.float64: + if isinstance(data, list) or data.dtype == np.float64: specialized_class = BallTree64 - elif dtype == np.float32: + elif data.dtype == np.float32: specialized_class = BallTree32 else: raise ValueError( - f"Unexpected dtype {dtype} provided. Please select a dtype from" - " {np.float32, np.float64}" + f"Unexpected dtype {data.dtype} provided. Please select a" + " dtype from {np.float32, np.float64}" ) - self.__ball_tree = specialized_class( + self._ball_tree = specialized_class( data=data, leaf_size=leaf_size, metric=metric, @@ -338,7 +337,7 @@ cdef class BallTree: breadth_first=False, sort_results=True, ): - return self.__ball_tree.query( + return self._ball_tree.query( X=X, k=k, return_distance=return_distance, @@ -355,7 +354,7 @@ cdef class BallTree: count_only=False, sort_results=False, ): - return self.__ball_tree.query_radius( + return self._ball_tree.query_radius( X=X, r=r, return_distance=return_distance, @@ -373,7 +372,7 @@ cdef class BallTree: breadth_first=True, return_log=False, ): - return self.__ball_tree.kernel_density( + return self._ball_tree.kernel_density( X=X, h=h, kernel=kernel, @@ -384,7 +383,7 @@ cdef class BallTree: ) def two_point_correlation(self, X, r, dualtree=False): - return self.__ball_tree.two_point_correlation( + return self._ball_tree.two_point_correlation( X=X, r=r, dualtree=dualtree, diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 4f17b57bacacd..6ac496dfdd53e 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -346,7 +346,7 @@ cdef class KDTree: and then calling any required public methods through this variable. """ - cdef object __kd_tree + cdef object _kd_tree def __init__( self, @@ -357,9 +357,9 @@ cdef class KDTree: **kwargs, ): dtype = data.dtype - if dtype == np.float64: + if data.dtype == np.float64: specialized_class = KDTree64 - elif dtype == np.float32: + elif data.dtype == np.float32: specialized_class = KDTree32 else: raise ValueError( @@ -367,7 +367,7 @@ cdef class KDTree: " {np.float32, np.float64}" ) - self.__kd_tree = specialized_class( + self._kd_tree = specialized_class( data=data, leaf_size=leaf_size, metric=metric, @@ -390,7 +390,7 @@ cdef class KDTree: breadth_first=False, sort_results=True, ): - return self.__kd_tree.query( + return self._kd_tree.query( X=X, k=k, return_distance=return_distance, @@ -407,7 +407,7 @@ cdef class KDTree: count_only=False, sort_results=False, ): - return self.__kd_tree.query_radius( + return self._kd_tree.query_radius( X=X, r=r, return_distance=return_distance, @@ -425,7 +425,7 @@ cdef class KDTree: breadth_first=True, return_log=False, ): - return self.__kd_tree.kernel_density( + return self._kd_tree.kernel_density( X=X, h=h, kernel=kernel, @@ -436,7 +436,7 @@ cdef class KDTree: ) def two_point_correlation(self, X, r, dualtree=False): - return self.__kd_tree.two_point_correlation( + return self._kd_tree.two_point_correlation( X=X, r=r, dualtree=dualtree, diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index da70327a2a856..71000cf0d1813 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -8,7 +8,7 @@ ) from sklearn.utils.parallel import delayed, Parallel -from sklearn.neighbors._kd_tree import KDTree64, KDTree32 +from sklearn.neighbors._kd_tree import KDTree DIMENSION = 3 @@ -18,8 +18,8 @@ def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) - with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree64(X) + with pytest.raises(ValueError, match="Unexpected dtype object provided"): + KDTree(X) def test_kdtree_picklable_with_joblib(): @@ -28,7 +28,7 @@ def test_kdtree_picklable_with_joblib(): Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree64(X, leaf_size=2) + tree = KDTree(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous @@ -43,8 +43,8 @@ def test_kd_tree_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) - kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree(X_32, leaf_size=2, metric=metric, **metric_params) # Test consistency with respect to the `query` method k = 4 @@ -83,8 +83,8 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) - kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree(X_32, leaf_size=2, metric=metric, **metric_params) kernel = "gaussian" h = 0.1 From 0a5fea29908891bb326bdb690706d11966be75dd Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 13 Jun 2023 16:06:03 +0500 Subject: [PATCH 25/36] Further fixes and use properties to access attributes of specialized classes --- sklearn/neighbors/_ball_tree.pyx.tp | 21 ++++++++++++++------- sklearn/neighbors/_kd_tree.pyx.tp | 22 ++++++++++++++-------- sklearn/neighbors/tests/test_ball_tree.py | 2 +- sklearn/neighbors/tests/test_kd_tree.py | 2 +- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 3a32c1e7b7197..c1ea61fbc8895 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -304,15 +304,10 @@ cdef class BallTree: sample_weight=None, **kwargs, ): - if isinstance(data, list) or data.dtype == np.float64: - specialized_class = BallTree64 - elif data.dtype == np.float32: + if isinstance(data, np.ndarray) and data.dtype == np.float32: specialized_class = BallTree32 else: - raise ValueError( - f"Unexpected dtype {data.dtype} provided. Please select a" - " dtype from {np.float32, np.float64}" - ) + specialized_class = BallTree64 self._ball_tree = specialized_class( data=data, @@ -322,6 +317,18 @@ cdef class BallTree: **kwargs, ) + @property + def data(self): + return self._ball_tree.data + + @property + def sample_weight(self): + return self._ball_tree.sample_weight + + @property + def sum_weight(self): + return self._ball_tree.sum_weight + @classmethod def valid_metrics(cls): # The metric ids/names are the same whether we consider diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 6ac496dfdd53e..6eaadf849f8c7 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -356,16 +356,10 @@ cdef class KDTree: sample_weight=None, **kwargs, ): - dtype = data.dtype - if data.dtype == np.float64: - specialized_class = KDTree64 - elif data.dtype == np.float32: + if isinstance(data, np.ndarray) and data.dtype == np.float32: specialized_class = KDTree32 else: - raise ValueError( - f"Unexpected dtype {dtype} provided. Please select a dtype from" - " {np.float32, np.float64}" - ) + specialized_class = KDTree64 self._kd_tree = specialized_class( data=data, @@ -375,6 +369,18 @@ cdef class KDTree: **kwargs, ) + @property + def data(self): + return self._kd_tree.data + + @property + def sample_weight(self): + return self._kd_tree.sample_weight + + @property + def sum_weight(self): + return self._kd_tree.sum_weight + @classmethod def valid_metrics(cls): # The metric ids/names are the same whether we consider diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index e2b78f6d23228..53321ae2486c9 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -84,7 +84,7 @@ def test_query_haversine(): def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) - with pytest.raises(ValueError, match="Unexpected dtype object provided"): + with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 71000cf0d1813..c2404914a0d8e 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -18,7 +18,7 @@ def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) - with pytest.raises(ValueError, match="Unexpected dtype object provided"): + with pytest.raises(ValueError, match="setting an array element with a sequence"): KDTree(X) From 810b1ed81cdf2af4c0ffcb24cbc304f8730adf28 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 13 Jun 2023 18:03:28 +0500 Subject: [PATCH 26/36] In _estimate_mi in mutual_info if y is continuous set it as np.float64 similar to X --- sklearn/feature_selection/_mutual_info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index 9cacfc3890784..e955b744130a8 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -297,6 +297,7 @@ def _estimate_mi( if not discrete_target: y = scale(y, with_mean=False) + y = y.astype(np.float64, copy=False) y += ( 1e-10 * np.maximum(1, np.mean(np.abs(y))) From b294a113397ca4157d57e0f37eccf7fb09a7daec Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 19 Jun 2023 17:20:20 +0500 Subject: [PATCH 27/36] Revert new structure --- sklearn/feature_selection/_mutual_info.py | 1 - sklearn/neighbors/__init__.py | 4 +- sklearn/neighbors/_ball_tree.pyx.tp | 114 +----------------- sklearn/neighbors/_base.py | 8 +- sklearn/neighbors/_kd_tree.pyx.tp | 114 +----------------- sklearn/neighbors/_kde.py | 4 +- sklearn/neighbors/tests/test_ball_tree.py | 24 ++-- sklearn/neighbors/tests/test_kd_tree.py | 14 +-- .../neighbors/tests/test_neighbors_tree.py | 4 +- 9 files changed, 31 insertions(+), 256 deletions(-) diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index e955b744130a8..9cacfc3890784 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -297,7 +297,6 @@ def _estimate_mi( if not discrete_target: y = scale(y, with_mean=False) - y = y.astype(np.float64, copy=False) y += ( 1e-10 * np.maximum(1, np.mean(np.abs(y))) diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8223c20991904..13abefc6298a3 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -3,8 +3,8 @@ algorithm. """ -from ._ball_tree import BallTree -from ._kd_tree import KDTree +from ._ball_tree import BallTree64 as BallTree +from ._kd_tree import KDTree64 as KDTree from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index c1ea61fbc8895..6d7245cbb2317 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -23,7 +23,7 @@ implementation_specific_values = [ }} -__all__ = ['BallTree'] +__all__ = ['BallTree64', 'BallTree32'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -283,115 +283,3 @@ cdef inline float64_t max_rdist_dual{{name_suffix}}( ) {{endfor}} - - -cdef class BallTree: - """ - Definition for the BallTree class that any underlying python estimators - are using. Within this class the respective float64 or float32 variants - are called depending on the data type of input data. This is done by - initializing BallTree64 or BallTree32 respectively as a private variable - and then calling any required public methods through this variable. - """ - - cdef object _ball_tree - - def __init__( - self, - data, - leaf_size=40, - metric='minkowski', - sample_weight=None, - **kwargs, - ): - if isinstance(data, np.ndarray) and data.dtype == np.float32: - specialized_class = BallTree32 - else: - specialized_class = BallTree64 - - self._ball_tree = specialized_class( - data=data, - leaf_size=leaf_size, - metric=metric, - sample_weight=sample_weight, - **kwargs, - ) - - @property - def data(self): - return self._ball_tree.data - - @property - def sample_weight(self): - return self._ball_tree.sample_weight - - @property - def sum_weight(self): - return self._ball_tree.sum_weight - - @classmethod - def valid_metrics(cls): - # The metric ids/names are the same whether we consider - # BallTree64 or BallTree32. So any of the two can be used. - return BallTree64.valid_metrics() - - def query( - self, - X, - k=1, - return_distance=True, - dualtree=False, - breadth_first=False, - sort_results=True, - ): - return self._ball_tree.query( - X=X, - k=k, - return_distance=return_distance, - dualtree=dualtree, - breadth_first=breadth_first, - sort_results=sort_results, - ) - - def query_radius( - self, - X, - r, - return_distance=False, - count_only=False, - sort_results=False, - ): - return self._ball_tree.query_radius( - X=X, - r=r, - return_distance=return_distance, - count_only=count_only, - sort_results=sort_results, - ) - - def kernel_density( - self, - X, - h, - kernel='gaussian', - atol=0, - rtol=1E-8, - breadth_first=True, - return_log=False, - ): - return self._ball_tree.kernel_density( - X=X, - h=h, - kernel=kernel, - atol=atol, - rtol=rtol, - breadth_first=breadth_first, - return_log=return_log, - ) - - def two_point_correlation(self, X, r, dualtree=False): - return self._ball_tree.two_point_correlation( - X=X, - r=r, - dualtree=dualtree, - ) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 82626d6705bb4..f57904f1380ef 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -18,8 +18,8 @@ from scipy.sparse import csr_matrix, issparse from joblib import effective_n_jobs -from ._ball_tree import BallTree -from ._kd_tree import KDTree +from ._ball_tree import BallTree64 as BallTree +from ._kd_tree import KDTree64 as KDTree from ..base import BaseEstimator, MultiOutputMixin from ..base import is_classifier from ..metrics import pairwise_distances_chunked @@ -68,8 +68,8 @@ SCIPY_METRICS += ["matching"] VALID_METRICS = dict( - ball_tree=BallTree.valid_metrics(), - kd_tree=KDTree.valid_metrics(), + ball_tree=BallTree._valid_metrics, + kd_tree=KDTree._valid_metrics, # The following list comes from the # sklearn.metrics.pairwise doc string brute=sorted(set(PAIRWISE_DISTANCE_FUNCTIONS).union(SCIPY_METRICS)), diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 6eaadf849f8c7..b684219f6b9a7 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -24,7 +24,7 @@ implementation_specific_values = [ }} -__all__ = ['KDTree'] +__all__ = ['KDTree64', 'KDTree32'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -335,115 +335,3 @@ cdef inline float64_t max_dist_dual{{name_suffix}}( ) {{endfor}} - - -cdef class KDTree: - """ - Definition for the KDTree class that any underlying python estimators - are using. Within this class the respective float64 or float32 variants - are called depending on the data type of input data. This is done by - initializing KDTree64 or KDTree32 respectively as a private variable - and then calling any required public methods through this variable. - """ - - cdef object _kd_tree - - def __init__( - self, - data, - leaf_size=40, - metric='minkowski', - sample_weight=None, - **kwargs, - ): - if isinstance(data, np.ndarray) and data.dtype == np.float32: - specialized_class = KDTree32 - else: - specialized_class = KDTree64 - - self._kd_tree = specialized_class( - data=data, - leaf_size=leaf_size, - metric=metric, - sample_weight=sample_weight, - **kwargs, - ) - - @property - def data(self): - return self._kd_tree.data - - @property - def sample_weight(self): - return self._kd_tree.sample_weight - - @property - def sum_weight(self): - return self._kd_tree.sum_weight - - @classmethod - def valid_metrics(cls): - # The metric ids/names are the same whether we consider - # KDTree64 or KDTree32. So any of the two can be used. - return KDTree64.valid_metrics() - - def query( - self, - X, - k=1, - return_distance=True, - dualtree=False, - breadth_first=False, - sort_results=True, - ): - return self._kd_tree.query( - X=X, - k=k, - return_distance=return_distance, - dualtree=dualtree, - breadth_first=breadth_first, - sort_results=sort_results, - ) - - def query_radius( - self, - X, - r, - return_distance=False, - count_only=False, - sort_results=False, - ): - return self._kd_tree.query_radius( - X=X, - r=r, - return_distance=return_distance, - count_only=count_only, - sort_results=sort_results, - ) - - def kernel_density( - self, - X, - h, - kernel='gaussian', - atol=0, - rtol=1E-8, - breadth_first=True, - return_log=False, - ): - return self._kd_tree.kernel_density( - X=X, - h=h, - kernel=kernel, - atol=atol, - rtol=rtol, - breadth_first=breadth_first, - return_log=return_log, - ) - - def two_point_correlation(self, X, r, dualtree=False): - return self._kd_tree.two_point_correlation( - X=X, - r=r, - dualtree=dualtree, - ) diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f285b03403b5f..74dc486f17b35 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -15,8 +15,8 @@ from ..utils.validation import _check_sample_weight, check_is_fitted from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms -from ._ball_tree import BallTree -from ._kd_tree import KDTree +from ._ball_tree import BallTree64 as BallTree +from ._kd_tree import KDTree64 as KDTree VALID_KERNELS = [ diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 53321ae2486c9..f5c9e1986a6c1 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -3,7 +3,7 @@ import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_allclose, assert_equal -from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._ball_tree import BallTree64, BallTree32 from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -64,7 +64,7 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree(X, leaf_size=1, metric=metric) + bt = BallTree64(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) @@ -73,7 +73,7 @@ def test_ball_tree_query_metrics(metric, array_type): def test_query_haversine(): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree(X, leaf_size=1, metric="haversine") + bt = BallTree64(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") @@ -85,7 +85,7 @@ def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree(X) + BallTree64(X) def test_bad_pyfunc_metric(): @@ -98,11 +98,11 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_returned_value) + BallTree64(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree(X, metric=one_arg_func) + BallTree64(X, metric=one_arg_func) @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) @@ -112,8 +112,8 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) - bt_32 = BallTree(X_32, leaf_size=1, metric=metric, **metric_params) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) # Test consistency with respect to the `query` method k = 5 @@ -152,8 +152,8 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - bt_64 = BallTree(X_64, leaf_size=1, metric=metric, **metric_params) - bt_32 = BallTree(X_32, leaf_size=1, metric=metric, **metric_params) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) kernel = "gaussian" h = 0.1 @@ -176,8 +176,8 @@ def test_two_point_correlation_numerical_consistency(global_random_seed): X_32 = _X.astype(dtype=np.float32, copy=False) Y_32 = _Y.astype(dtype=np.float32, copy=False) - bt_64 = BallTree(X_64, leaf_size=10) - bt_32 = BallTree(X_32, leaf_size=10) + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) r = np.linspace(0, 1, 10) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index c2404914a0d8e..da70327a2a856 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -8,7 +8,7 @@ ) from sklearn.utils.parallel import delayed, Parallel -from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._kd_tree import KDTree64, KDTree32 DIMENSION = 3 @@ -19,7 +19,7 @@ def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree(X) + KDTree64(X) def test_kdtree_picklable_with_joblib(): @@ -28,7 +28,7 @@ def test_kdtree_picklable_with_joblib(): Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree(X, leaf_size=2) + tree = KDTree64(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous @@ -43,8 +43,8 @@ def test_kd_tree_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) - kd_32 = KDTree(X_32, leaf_size=2, metric=metric, **metric_params) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) # Test consistency with respect to the `query` method k = 4 @@ -83,8 +83,8 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): ) metric_params = METRICS.get(metric, {}) - kd_64 = KDTree(X_64, leaf_size=2, metric=metric, **metric_params) - kd_32 = KDTree(X_32, leaf_size=2, metric=metric, **metric_params) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) kernel = "gaussian" h = 0.1 diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index ce9fb11cf2510..42548cd9690ee 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -8,14 +8,14 @@ from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( - BallTree, + BallTree64 as BallTree, kernel_norm, NeighborsHeap64 as NeighborsHeapBT, simultaneous_sort as simultaneous_sort_bt, nodeheap_sort as nodeheap_sort_bt, ) from sklearn.neighbors._kd_tree import ( - KDTree, + KDTree64 as KDTree, NeighborsHeap64 as NeighborsHeapKDT, simultaneous_sort as simultaneous_sort_kdt, nodeheap_sort as nodeheap_sort_kdt, From 0527e77d0ebe23df71c6ad9bc9b84a657e5dc5ff Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 19 Jun 2023 17:50:15 +0500 Subject: [PATCH 28/36] Parameterize applicable tests over 64 and 32 class versions --- sklearn/neighbors/tests/test_ball_tree.py | 29 +++++++++++++++-------- sklearn/neighbors/tests/test_kd_tree.py | 15 ++++++++---- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index f5c9e1986a6c1..ab9153c05afae 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -38,6 +38,11 @@ "sokalsneath", ] +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + def brute_force_neighbors(X, Y, k, metric, **kwargs): from sklearn.metrics import DistanceMetric @@ -51,7 +56,8 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) -def test_ball_tree_query_metrics(metric, array_type): +@pytest.mark.parametrize("ball_tree_class", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, ball_tree_class): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -64,31 +70,34 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree64(X, leaf_size=1, metric=metric) + bt = ball_tree_class(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) -def test_query_haversine(): +@pytest.mark.parametrize("ball_tree_class, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5])) +def test_query_haversine(ball_tree_class, decimal_tol): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree64(X, leaf_size=1, metric="haversine") + bt = ball_tree_class(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") - assert_array_almost_equal(dist1, dist2) + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) assert_array_almost_equal(ind1, ind2) -def test_array_object_type(): +@pytest.mark.parametrize("ball_tree_class", BALL_TREE_CLASSES) +def test_array_object_type(ball_tree_class): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree64(X) + ball_tree_class(X) -def test_bad_pyfunc_metric(): +@pytest.mark.parametrize("ball_tree_class", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(ball_tree_class): def wrong_returned_value(x, y): return "1" @@ -98,11 +107,11 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree64(X, metric=wrong_returned_value) + ball_tree_class(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree64(X, metric=one_arg_func) + ball_tree_class(X, metric=one_arg_func) @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index da70327a2a856..1f4bbb3a5a6a6 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -14,21 +14,28 @@ METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} +KD_TREE_CLASSES = [ + KDTree64, + KDTree64, +] -def test_array_object_type(): + +@pytest.mark.parametrize("kd_tree_class", KD_TREE_CLASSES) +def test_array_object_type(kd_tree_class): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree64(X) + kd_tree_class(X) -def test_kdtree_picklable_with_joblib(): +@pytest.mark.parametrize("kd_tree_class", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(kd_tree_class): """Make sure that KDTree queries work when joblib memmaps. Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree64(X, leaf_size=2) + tree = kd_tree_class(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous From 738724e242e9246dfae9855257a75a328a177fbb Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 22 Jun 2023 21:45:24 +0500 Subject: [PATCH 29/36] Address all PR suggestions --- doc/whats_new/v1.3.rst | 4 --- sklearn/neighbors/_ball_tree.pyx.tp | 6 ----- sklearn/neighbors/_binary_tree.pxi.tp | 33 ++++++----------------- sklearn/neighbors/_kd_tree.pyx.tp | 6 ----- sklearn/neighbors/tests/test_ball_tree.py | 2 ++ sklearn/neighbors/tests/test_kd_tree.py | 16 ++++++----- 6 files changed, 19 insertions(+), 48 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 6114c5577070b..56bdb2bb8f291 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -533,10 +533,6 @@ Changelog callables in :class:`neighbors.NearestNeighbors` is deprecated and will be removed in version 1.5. :pr:`24083` by :user:`Valentin Laurent `. -- |Enhancement| :class:`neighbors.KDTree` and :class:`neighbors.BallTree` - now support `numpy.float32`. - :pr:`25914` by :user:`Omar Salman `. - :mod:`sklearn.neural_network` ............................. diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index 6d7245cbb2317..cabfb9e220c0e 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -7,12 +7,6 @@ implementation_specific_values = [ # # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE # - # An empty string is used for the `name_suffix` of the float64 case - # and '32' bit is used for the `name_suffix` of the float32 case. - # This allows us to use `BinaryTree` conveniently. - # - # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` - # ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 676ffead07145..311ddff58f202 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -7,13 +7,6 @@ implementation_specific_values = [ # # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE # - # An empty string is used for the `name_suffix` of the float64 case - # and '32' bit is used for the `name_suffix` of the float32 case. - # This allows us to use `DistanceMetric` conveniently and the default - # float64 case can be used without any particular modifications. - # - # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` - # ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') ] @@ -22,6 +15,8 @@ implementation_specific_values = [ # ===================== # # Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# # License: BSD # # The file generated is then literally included in ball_tree.pyx and kd_tree.pyx. @@ -2407,20 +2402,6 @@ cdef class BinaryTree{{name_suffix}}: ###################################################################### # Python functions for benchmarking and testing C implementations -{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} - -def load_heap{{name_suffix}}({{INPUT_DTYPE_t}}[:, ::1] X, intp_t k): - """test fully loading the heap""" - assert k <= X.shape[1] - cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(X.shape[0], k) - cdef intp_t i, j - for i in range(X.shape[0]): - for j in range(X.shape[1]): - heap._push(i, X[i, j], j) - return heap.get_arrays() - -{{endfor}} - def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays @@ -2459,10 +2440,12 @@ def nodeheap_sort(float64_t[::1] vals): return np.asarray(vals_sorted), np.asarray(indices) -cdef inline float64_t _total_node_weight(NodeData_t* node_data, - const floating* sample_weight, - intp_t* idx_array, - intp_t i_node): +cdef inline float64_t _total_node_weight( + NodeData_t* node_data, + const floating* sample_weight, + intp_t* idx_array, + intp_t i_node, +): cdef intp_t i cdef float64_t N = 0.0 for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index b684219f6b9a7..861819a312f97 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -7,12 +7,6 @@ implementation_specific_values = [ # # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE # - # An empty string is used for the `name_suffix` of the float64 case - # and '32' bit is used for the `name_suffix` of the float32 case. - # This allows us to use `BinaryTree` conveniently. - # - # Note: we use the 64bit types as defined in `sklearn.utils._typedefs` - # ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index ab9153c05afae..e9dffa081e31d 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -116,6 +116,8 @@ def one_arg_func(x): @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. X_64, X_32, Y_64, Y_32 = get_dataset_for_query_methods( random_seed=global_random_seed ) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 1f4bbb3a5a6a6..692213be5b349 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -16,26 +16,26 @@ KD_TREE_CLASSES = [ KDTree64, - KDTree64, + KDTree32, ] -@pytest.mark.parametrize("kd_tree_class", KD_TREE_CLASSES) -def test_array_object_type(kd_tree_class): +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - kd_tree_class(X) + BinarySearchTree(X) -@pytest.mark.parametrize("kd_tree_class", KD_TREE_CLASSES) -def test_kdtree_picklable_with_joblib(kd_tree_class): +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): """Make sure that KDTree queries work when joblib memmaps. Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = kd_tree_class(X, leaf_size=2) + tree = BinarySearchTree(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous @@ -45,6 +45,8 @@ def test_kdtree_picklable_with_joblib(kd_tree_class): @pytest.mark.parametrize("metric", METRICS) def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. X_64, X_32, Y_64, Y_32 = get_dataset_for_query_methods( random_seed=global_random_seed ) From 56dacf51a31c4fb0010bf0deb7c59c11fa8be1a9 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Sun, 25 Jun 2023 17:36:45 +0500 Subject: [PATCH 30/36] PR suggestions --- sklearn/neighbors/_binary_tree.pxi.tp | 134 ++++++++++++++-------- sklearn/neighbors/tests/test_ball_tree.py | 28 ++--- 2 files changed, 101 insertions(+), 61 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 311ddff58f202..eb8faf284ad9b 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -19,8 +19,8 @@ implementation_specific_values = [ # # License: BSD # -# The file generated is then literally included in ball_tree.pyx and kd_tree.pyx. -# See ball_tree.pyx.tp and kd_tree.pyx.tp. +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. }} @@ -1614,10 +1614,14 @@ cdef class BinaryTree{{name_suffix}}: return count - cdef int _query_single_depthfirst(self, intp_t i_node, - {{INPUT_DTYPE_t}}* pt, intp_t i_pt, - NeighborsHeap{{name_suffix}} heap, - float64_t reduced_dist_LB) except -1 nogil: + cdef int _query_single_depthfirst( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" cdef NodeData_t node_info = self.node_data[i_node] @@ -1665,10 +1669,13 @@ cdef class BinaryTree{{name_suffix}}: reduced_dist_LB_1) return 0 - cdef int _query_single_breadthfirst(self, {{INPUT_DTYPE_t}}* pt, - intp_t i_pt, - NeighborsHeap{{name_suffix}} heap, - NodeHeap nodeheap) except -1: + cdef int _query_single_breadthfirst( + self, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive single-tree k-neighbors query, breadth-first search""" cdef intp_t i, i_node cdef float64_t dist_pt, reduced_dist_LB @@ -1714,11 +1721,15 @@ cdef class BinaryTree{{name_suffix}}: nodeheap.push(nodeheap_item) return 0 - cdef int _query_dual_depthfirst(self, intp_t i_node1, - BinaryTree{{name_suffix}} other, intp_t i_node2, - float64_t[::1] bounds, - NeighborsHeap{{name_suffix}} heap, - float64_t reduced_dist_LB) except -1: + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: """Recursive dual-tree k-neighbors query, depth-first""" # note that the array `bounds` is maintained such that # bounds[i] is the largest distance among any of the @@ -1815,9 +1826,12 @@ cdef class BinaryTree{{name_suffix}}: bounds, heap, reduced_dist_LB1) return 0 - cdef int _query_dual_breadthfirst(self, BinaryTree{{name_suffix}} other, - NeighborsHeap{{name_suffix}} heap, - NodeHeap nodeheap) except -1: + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" cdef intp_t i, i1, i2, i_node1, i_node2, i_pt cdef float64_t dist_pt, reduced_dist_LB @@ -1899,14 +1913,17 @@ cdef class BinaryTree{{name_suffix}}: nodeheap.push(nodeheap_item) return 0 - cdef intp_t _query_radius_single(self, - intp_t i_node, - {{INPUT_DTYPE_t}}* pt, float64_t r, - intp_t* indices, - {{INPUT_DTYPE_t}}* distances, - intp_t count, - int count_only, - int return_distance) noexcept nogil: + cdef intp_t _query_radius_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: """recursive single-tree radius query, depth-first""" cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] @@ -1975,13 +1992,17 @@ cdef class BinaryTree{{name_suffix}}: return count - cdef float64_t _kde_single_breadthfirst(self, {{INPUT_DTYPE_t}}* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - NodeHeap nodeheap, - float64_t* node_log_min_bounds, - float64_t* node_log_bound_spreads): + cdef float64_t _kde_single_breadthfirst( + self, {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): """non-recursive single-tree kernel density estimation""" # For the given point, node_log_min_bounds and node_log_bound_spreads # will encode the current bounds on the density between the point @@ -2140,14 +2161,19 @@ cdef class BinaryTree{{name_suffix}}: global_log_bound_spread - log(2)) cdef int _kde_single_depthfirst( - self, intp_t i_node, {{INPUT_DTYPE_t}}* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - float64_t local_log_min_bound, - float64_t local_log_bound_spread, - float64_t* global_log_min_bound, - float64_t* global_log_bound_spread) except -1: + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: """recursive single-tree kernel density estimate, depth-first""" # For the given point, local_min_bound and local_max_bound give the # minimum and maximum density for the current node, while @@ -2276,9 +2302,15 @@ cdef class BinaryTree{{name_suffix}}: global_log_bound_spread) return 0 - cdef int _two_point_single(self, intp_t i_node, {{INPUT_DTYPE_t}}* pt, float64_t* r, - intp_t* count, intp_t i_min, - intp_t i_max) except -1: + cdef int _two_point_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive single-tree two-point correlation function query""" cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] @@ -2325,10 +2357,16 @@ cdef class BinaryTree{{name_suffix}}: count, i_min, i_max) return 0 - cdef int _two_point_dual(self, intp_t i_node1, - BinaryTree{{name_suffix}} other, intp_t i_node2, - float64_t* r, intp_t* count, - intp_t i_min, intp_t i_max) except -1: + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive dual-tree two-point correlation function query""" cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 3a4e59cd8133a..e55300dcaaa1d 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -57,8 +57,8 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) -@pytest.mark.parametrize("ball_tree_class", BALL_TREE_CLASSES) -def test_ball_tree_query_metrics(metric, array_type, ball_tree_class): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -71,17 +71,19 @@ def test_ball_tree_query_metrics(metric, array_type, ball_tree_class): k = 5 - bt = ball_tree_class(X, leaf_size=1, metric=metric) + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) -@pytest.mark.parametrize("ball_tree_class, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5])) -def test_query_haversine(ball_tree_class, decimal_tol): +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = ball_tree_class(X, leaf_size=1, metric="haversine") + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") @@ -89,16 +91,16 @@ def test_query_haversine(ball_tree_class, decimal_tol): assert_array_almost_equal(ind1, ind2) -@pytest.mark.parametrize("ball_tree_class", BALL_TREE_CLASSES) -def test_array_object_type(ball_tree_class): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - ball_tree_class(X) + BallTreeImplementation(X) -@pytest.mark.parametrize("ball_tree_class", BALL_TREE_CLASSES) -def test_bad_pyfunc_metric(ball_tree_class): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): def wrong_returned_value(x, y): return "1" @@ -108,11 +110,11 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - ball_tree_class(X, metric=wrong_returned_value) + BallTreeImplementation(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - ball_tree_class(X, metric=one_arg_func) + BallTreeImplementation(X, metric=one_arg_func) @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) From 7e713676f5508b5567bc5b0e7a079ad70a32616a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Jun 2023 10:46:48 +0500 Subject: [PATCH 31/36] PR suggestion: Use a public BallTree class that inherits from BallTree64 --- sklearn/neighbors/__init__.py | 4 ++-- sklearn/neighbors/_ball_tree.pyx.tp | 6 +++++- sklearn/neighbors/_base.py | 4 ++-- sklearn/neighbors/_kd_tree.pyx.tp | 6 +++++- sklearn/neighbors/_kde.py | 4 ++-- sklearn/neighbors/tests/test_ball_tree.py | 3 ++- sklearn/neighbors/tests/test_kd_tree.py | 3 ++- sklearn/neighbors/tests/test_neighbors_tree.py | 8 +++----- 8 files changed, 23 insertions(+), 15 deletions(-) diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 17025adce5213..ce697656b4c2e 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -3,7 +3,7 @@ algorithm. """ -from ._ball_tree import BallTree64 as BallTree +from ._ball_tree import BallTree from ._base import VALID_METRICS, VALID_METRICS_SPARSE, sort_graph_by_row_values from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier from ._graph import ( @@ -12,7 +12,7 @@ kneighbors_graph, radius_neighbors_graph, ) -from ._kd_tree import KDTree64 as KDTree +from ._kd_tree import KDTree from ._kde import KernelDensity from ._lof import LocalOutlierFactor from ._nca import NeighborhoodComponentsAnalysis diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index cabfb9e220c0e..a89bebcd8d6fc 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -17,7 +17,7 @@ implementation_specific_values = [ }} -__all__ = ['BallTree64', 'BallTree32'] +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -277,3 +277,7 @@ cdef inline float64_t max_rdist_dual{{name_suffix}}( ) {{endfor}} + + +class BallTree(BallTree64): + pass diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index f61bc126d037e..5483aa31d1bcd 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -35,8 +35,8 @@ from ..utils.multiclass import check_classification_targets from ..utils.parallel import Parallel, delayed from ..utils.validation import check_is_fitted, check_non_negative -from ._ball_tree import BallTree64 as BallTree -from ._kd_tree import KDTree64 as KDTree +from ._ball_tree import BallTree +from ._kd_tree import KDTree SCIPY_METRICS = [ "braycurtis", diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index 861819a312f97..ef388b9d5826c 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -18,7 +18,7 @@ implementation_specific_values = [ }} -__all__ = ['KDTree64', 'KDTree32'] +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -329,3 +329,7 @@ cdef inline float64_t max_dist_dual{{name_suffix}}( ) {{endfor}} + + +class KDTree(KDTree64): + pass diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f7501ad357a01..ff1b48c672218 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -15,8 +15,8 @@ from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms from ..utils.validation import _check_sample_weight, check_is_fitted -from ._ball_tree import BallTree64 as BallTree -from ._kd_tree import KDTree64 as KDTree +from ._ball_tree import BallTree +from ._kd_tree import KDTree VALID_KERNELS = [ "gaussian", diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index e55300dcaaa1d..c8d5ec83efe5b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -4,7 +4,7 @@ import pytest from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal -from sklearn.neighbors._ball_tree import BallTree32, BallTree64 +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 from sklearn.utils import check_random_state from sklearn.utils._testing import _convert_container from sklearn.utils.validation import check_array @@ -42,6 +42,7 @@ BALL_TREE_CLASSES = [ BallTree64, BallTree32, + BallTree, ] diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index fac164bbbc136..491a2b45a9a7d 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -2,7 +2,7 @@ import pytest from numpy.testing import assert_allclose, assert_equal -from sklearn.neighbors._kd_tree import KDTree32, KDTree64 +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 from sklearn.neighbors.tests.test_ball_tree import ( get_dataset_for_kernel_density, get_dataset_for_query_methods, @@ -16,6 +16,7 @@ KD_TREE_CLASSES = [ KDTree64, KDTree32, + KDTree, ] diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index ae216b3bacdb5..4d8bac12f7423 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -9,14 +9,12 @@ from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( - BallTree64 as BallTree, + BallTree, + kernel_norm, ) from sklearn.neighbors._ball_tree import ( NeighborsHeap64 as NeighborsHeapBT, ) -from sklearn.neighbors._ball_tree import ( - kernel_norm, -) from sklearn.neighbors._ball_tree import ( nodeheap_sort as nodeheap_sort_bt, ) @@ -24,7 +22,7 @@ simultaneous_sort as simultaneous_sort_bt, ) from sklearn.neighbors._kd_tree import ( - KDTree64 as KDTree, + KDTree, ) from sklearn.neighbors._kd_tree import ( NeighborsHeap64 as NeighborsHeapKDT, From aa7cbe44e04d5f2c40956ce4a09556bd492ff186 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Wed, 5 Jul 2023 12:31:07 +0500 Subject: [PATCH 32/36] Applies PR suggestions --- sklearn/neighbors/tests/test_ball_tree.py | 62 +++++++---------------- sklearn/neighbors/tests/test_kd_tree.py | 37 ++++++-------- 2 files changed, 33 insertions(+), 66 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index c8d5ec83efe5b..b9441e04e1cfc 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -122,8 +122,8 @@ def one_arg_func(x): def test_ball_tree_numerical_consistency(global_random_seed, metric): # Results on float64 and float32 versions of a dataset must be # numerically close. - X_64, X_32, Y_64, Y_32 = get_dataset_for_query_methods( - random_seed=global_random_seed + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 ) metric_params = METRICS.get(metric, {}) @@ -141,30 +141,26 @@ def test_ball_tree_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `query_radius` method r = 2.38 - ind_64, neighbors_64 = bt_64.query_radius(Y_64[0:2, :], r=r) - ind_32, neighbors_32 = bt_32.query_radius(Y_32[0:2, :], r=r) - assert_equal(ind_64, ind_32) - assert_allclose( - neighbors_64, - neighbors_32, - ) + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) # Test consistency with respect to the `query_radius` method # with return distances being true - ind_64, dist_64 = bt_64.query_radius(Y_64[4:5, :], r=r, return_distance=True) - ind_32, dist_32 = bt_32.query_radius(Y_32[4:5, :], r=r, return_distance=True) - assert_equal(ind_64[0], ind_32[0]) - assert_allclose(dist_64[0], dist_32[0], rtol=1e-5) - assert dist_64[0].dtype == np.float64 - assert dist_32[0].dtype == np.float32 + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 @pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) def test_kernel_density_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `kernel_density` method - X_64, X_32, Y_64, Y_32 = get_dataset_for_kernel_density( - random_seed=global_random_seed - ) + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) metric_params = METRICS.get(metric, {}) bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) @@ -181,15 +177,7 @@ def test_kernel_density_numerical_consistency(global_random_seed, metric): def test_two_point_correlation_numerical_consistency(global_random_seed): # Test consistency with respect to the `two_point_correlation` method - rng = np.random.RandomState(global_random_seed) - _X = rng.random_sample((100, 3)) - _Y = rng.random_sample((5, 3)) - - X_64 = _X.astype(dtype=np.float64, copy=False) - Y_64 = _Y.astype(dtype=np.float64, copy=False) - - X_32 = _X.astype(dtype=np.float32, copy=False) - Y_32 = _Y.astype(dtype=np.float32, copy=False) + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) bt_64 = BallTree64(X_64, leaf_size=10) bt_32 = BallTree32(X_32, leaf_size=10) @@ -201,24 +189,10 @@ def test_two_point_correlation_numerical_consistency(global_random_seed): assert_allclose(counts_64, counts_32) -def get_dataset_for_query_methods(random_seed): - rng = np.random.RandomState(random_seed) - _X = rng.rand(100, 50) - _Y = rng.rand(5, 50) - - X_64 = _X.astype(dtype=np.float64, copy=False) - Y_64 = _Y.astype(dtype=np.float64, copy=False) - - X_32 = _X.astype(dtype=np.float32, copy=False) - Y_32 = _Y.astype(dtype=np.float32, copy=False) - - return X_64, X_32, Y_64, Y_32 - - -def get_dataset_for_kernel_density(random_seed): +def get_dataset_for_binary_tree(random_seed, features=3): rng = np.random.RandomState(random_seed) - _X = rng.random_sample((100, 3)) - _Y = rng.random_sample((5, 3)) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) X_64 = _X.astype(dtype=np.float64, copy=False) Y_64 = _Y.astype(dtype=np.float64, copy=False) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 491a2b45a9a7d..37db4ac34e356 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -3,10 +3,7 @@ from numpy.testing import assert_allclose, assert_equal from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 -from sklearn.neighbors.tests.test_ball_tree import ( - get_dataset_for_kernel_density, - get_dataset_for_query_methods, -) +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree from sklearn.utils.parallel import Parallel, delayed DIMENSION = 3 @@ -47,8 +44,8 @@ def test_kdtree_picklable_with_joblib(BinarySearchTree): def test_kd_tree_numerical_consistency(global_random_seed, metric): # Results on float64 and float32 versions of a dataset must be # numerically close. - X_64, X_32, Y_64, Y_32 = get_dataset_for_query_methods( - random_seed=global_random_seed + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 ) metric_params = METRICS.get(metric, {}) @@ -66,30 +63,26 @@ def test_kd_tree_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `query_radius` method r = 2.38 - ind_64, neighbors_64 = kd_64.query_radius(Y_64[0:2, :], r=r) - ind_32, neighbors_32 = kd_32.query_radius(Y_32[0:2, :], r=r) - assert_equal(ind_64, ind_32) - assert_allclose( - neighbors_64, - neighbors_32, - ) + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) # Test consistency with respect to the `query_radius` method # with return distances being true - ind_64, dist_64 = kd_64.query_radius(Y_64[4:5, :], r=r, return_distance=True) - ind_32, dist_32 = kd_32.query_radius(Y_32[4:5, :], r=r, return_distance=True) - assert_equal(ind_64[0], ind_32[0]) - assert_allclose(dist_64[0], dist_32[0], rtol=1e-5) - assert dist_64[0].dtype == np.float64 - assert dist_32[0].dtype == np.float32 + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 @pytest.mark.parametrize("metric", METRICS) def test_kernel_density_numerical_consistency(global_random_seed, metric): # Test consistency with respect to the `kernel_density` method - X_64, X_32, Y_64, Y_32 = get_dataset_for_kernel_density( - random_seed=global_random_seed - ) + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) metric_params = METRICS.get(metric, {}) kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) From 8a1433c866c7b7a174541441bc01c44147f185b1 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 7 Jul 2023 11:10:56 +0500 Subject: [PATCH 33/36] Remove additional metrics --- sklearn/neighbors/tests/test_ball_tree.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index b9441e04e1cfc..5e8af244c9661 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -22,12 +22,6 @@ "chebyshev": {}, } -ADDITIONAL_METRICS = { - "seuclidean": dict(V=rng.random_sample(DIMENSION)), - "wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)), - "mahalanobis": dict(V=V_mahalanobis), -} - DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] BOOLEAN_METRICS = [ From 0d1a58e88d610f5ff6e723d1210f312f4b7d5400 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 7 Jul 2023 16:30:03 +0500 Subject: [PATCH 34/36] Fix unintended error during merge with main --- sklearn/neighbors/_binary_tree.pxi.tp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index 85c26bc9b55f5..6322f809f7eb9 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1010,8 +1010,8 @@ cdef class BinaryTree{{name_suffix}}: self.node_bounds.base, ) - cdef inline float64_t dist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t dist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: From 179eb063887f8f7234eea57eb34b07a29c3f2192 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Sat, 29 Jul 2023 13:03:24 +0500 Subject: [PATCH 35/36] Address PR suggestions --- setup.py | 7 +++---- sklearn/neighbors/tests/test_ball_tree.py | 5 ++++- sklearn/neighbors/tests/test_kd_tree.py | 5 ++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index a280da8ebbc0a..f9ae13c94502b 100755 --- a/setup.py +++ b/setup.py @@ -500,9 +500,8 @@ def configure_extension_modules(): # `source` is a Tempita file tempita_sources.append(source) - # Do not include header files (".pxd") and include files - # (".pxi") that were generated by Tempita. - if os.path.splitext(new_source_path)[-1] not in (".pxd", ".pxi"): + # Only include source files that are pyx files + if os.path.splitext(new_source_path)[-1] == ".pyx": sources.append(new_source_path) gen_from_templates(tempita_sources) @@ -510,7 +509,7 @@ def configure_extension_modules(): # Do not progress if we only have a tempita file which we don't # want to include like the .pxi.tp extension. In such a case # sources would be empty. - if len(sources) == 0: + if not sources: continue # By convention, our extensions always use the name of the first source diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 5e8af244c9661..5263f201f320b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -36,7 +36,6 @@ BALL_TREE_CLASSES = [ BallTree64, BallTree32, - BallTree, ] @@ -50,6 +49,10 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) @pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 37db4ac34e356..749601baaf66f 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -13,10 +13,13 @@ KD_TREE_CLASSES = [ KDTree64, KDTree32, - KDTree, ] +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + @pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) def test_array_object_type(BinarySearchTree): """Check that we do not accept object dtype array.""" From 192dfc3b77e9ecc3bebdc486608a41ef9ff24dd5 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Tue, 1 Aug 2023 11:41:25 +0500 Subject: [PATCH 36/36] Add missing docs --- sklearn/neighbors/_ball_tree.pyx.tp | 1 + sklearn/neighbors/_kd_tree.pyx.tp | 1 + 2 files changed, 2 insertions(+) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp index a89bebcd8d6fc..92b26714e5d9f 100644 --- a/sklearn/neighbors/_ball_tree.pyx.tp +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -280,4 +280,5 @@ cdef inline float64_t max_rdist_dual{{name_suffix}}( class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") pass diff --git a/sklearn/neighbors/_kd_tree.pyx.tp b/sklearn/neighbors/_kd_tree.pyx.tp index ef388b9d5826c..1006ec2a8398c 100644 --- a/sklearn/neighbors/_kd_tree.pyx.tp +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -332,4 +332,5 @@ cdef inline float64_t max_dist_dual{{name_suffix}}( class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") pass