diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a6c955b5afcdc..7459e90194a3e 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -51,6 +51,12 @@ Changelog are always consistent with `scipy.spatial.distance.cdist`. :pr:`21741` by :user:`Olivier Grisel <ogrisel>`. +:mod:`sklearn.neighbors` +........................ + +- |Fix| :class:`neighbors.KDTree` and :class:`neighbors.BallTree` correctly support + read-only buffer attributes. :pr:`21845` by `Thomas Fan`_. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 468488ac276ea..911914de1bdff 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -20,7 +20,7 @@ JOBLIB_MIN_VERSION = "0.11" THREADPOOLCTL_MIN_VERSION = "2.0.0" PYTEST_MIN_VERSION = "5.0.1" -CYTHON_MIN_VERSION = "0.28.5" +CYTHON_MIN_VERSION = "0.29.24" # 'build' and 'install' is included to have structured metadata for CI. diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index b5ac18365631a..298c9c995c9c6 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -43,7 +43,7 @@ cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, return 0 -cdef int init_node(BinaryTree tree, ITYPE_t i_node, +cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, ITYPE_t idx_start, ITYPE_t idx_end) except -1: """Initialize the node for the dataset stored in tree.data""" cdef ITYPE_t n_features = tree.data.shape[1] @@ -94,9 +94,9 @@ cdef int init_node(BinaryTree tree, ITYPE_t i_node, data + n_features * idx_array[i], n_features)) - tree.node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) - tree.node_data[i_node].idx_start = idx_start - tree.node_data[i_node].idx_end = idx_end + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end return 0 diff 
--git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index f25da86e2148c..6542bc680c58c 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -900,9 +900,13 @@ cdef class BinaryTree: cdef readonly const DTYPE_t[:, ::1] data cdef readonly const DTYPE_t[::1] sample_weight cdef public DTYPE_t sum_weight - cdef public ITYPE_t[::1] idx_array - cdef public NodeData_t[::1] node_data - cdef public DTYPE_t[:, :, ::1] node_bounds + + # Even though these memoryview attributes are const-qualified, + # they get modified via their numpy counterparts. + # For instance, `node_data` gets modified via `node_data_arr`. + cdef public const ITYPE_t[::1] idx_array + cdef public const NodeData_t[::1] node_data + cdef public const DTYPE_t[:, :, ::1] node_bounds cdef ITYPE_t leaf_size cdef ITYPE_t n_levels @@ -986,7 +990,12 @@ cdef class BinaryTree: # Allocate tree-specific data allocate_data(self, self.n_nodes, n_features) - self._recursive_build(0, 0, n_samples) + self._recursive_build( + node_data=self.node_data_arr, + i_node=0, + idx_start=0, + idx_end=n_samples + ) def _update_sample_weight(self, n_samples, sample_weight): if sample_weight is not None: @@ -1133,7 +1142,7 @@ cdef class BinaryTree: else: return self.dist_metric.rdist(x1, x2, size) - cdef int _recursive_build(self, ITYPE_t i_node, ITYPE_t idx_start, + cdef int _recursive_build(self, NodeData_t[::1] node_data, ITYPE_t i_node, ITYPE_t idx_start, ITYPE_t idx_end) except -1: """Recursively build the tree. 
@@ -1153,10 +1162,10 @@ cdef class BinaryTree: cdef DTYPE_t* data = &self.data[0, 0] # initialize node data - init_node(self, i_node, idx_start, idx_end) + init_node(self, node_data, i_node, idx_start, idx_end) if 2 * i_node + 1 >= self.n_nodes: - self.node_data[i_node].is_leaf = True + node_data[i_node].is_leaf = True if idx_end - idx_start > 2 * self.leaf_size: # this shouldn't happen if our memory allocation is correct # we'll proactively prevent memory errors, but raise a @@ -1171,18 +1180,18 @@ cdef class BinaryTree: import warnings warnings.warn("Internal: memory layout is flawed: " "too many nodes allocated") - self.node_data[i_node].is_leaf = True + node_data[i_node].is_leaf = True else: # split node and recursively construct child nodes. - self.node_data[i_node].is_leaf = False + node_data[i_node].is_leaf = False i_max = find_node_split_dim(data, idx_array, n_features, n_points) partition_node_indices(data, idx_array, i_max, n_mid, n_features, n_points) - self._recursive_build(2 * i_node + 1, + self._recursive_build(node_data,2 * i_node + 1, idx_start, idx_start + n_mid) - self._recursive_build(2 * i_node + 2, + self._recursive_build(node_data, 2 * i_node + 2, idx_start + n_mid, idx_end) def query(self, X, k=1, return_distance=True, diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx index 59199c41f2e85..d61289e369e4a 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx @@ -37,7 +37,7 @@ cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, return 0 -cdef int init_node(BinaryTree tree, ITYPE_t i_node, +cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, ITYPE_t i_node, ITYPE_t idx_start, ITYPE_t idx_end) except -1: """Initialize the node for the dataset stored in tree.data""" cdef ITYPE_t n_features = tree.data.shape[1] @@ -72,13 +72,13 @@ cdef int init_node(BinaryTree tree, ITYPE_t i_node, rad += pow(0.5 * abs(upper_bounds[j] - lower_bounds[j]), tree.dist_metric.p) - 
tree.node_data[i_node].idx_start = idx_start - tree.node_data[i_node].idx_end = idx_end + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end # The radius will hold the size of the circumscribed hypersphere measured # with the specified metric: in querying, this is used as a measure of the # size of each node when deciding which nodes to split. - tree.node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p) + node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p) return 0 diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 64e37a6363274..d8d9437636d1d 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,5 +1,7 @@ import numpy as np import pytest +from joblib import Parallel +from sklearn.utils.fixes import delayed from sklearn.neighbors._kd_tree import KDTree @@ -13,3 +15,17 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): KDTree(X) + + +def test_kdtree_picklable_with_joblib(): + """Make sure that KDTree queries work when joblib memmaps. + + Non-regression test for #21685 and #21228.""" + rng = np.random.RandomState(0) + X = rng.random_sample((10, 3)) + tree = KDTree(X, leaf_size=2) + + # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that + # used to raise "ValueError: buffer source array is read-only" in a previous + # version of the Cython code. + Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X])