Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 4cc4a5d

Browse files
committed
ENH improve docstrings and test for radius_neighbors models
Make it more explicit that the order of the results of a call to radius_neighbors is undefined. FIX 4303
1 parent 6ce0b35 commit 4cc4a5d

File tree

3 files changed

+72
-41
lines changed

3 files changed

+72
-41
lines changed

sklearn/neighbors/approximate.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,11 @@ class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin):
117117
Random projection is used as the hash family which approximates
118118
cosine distance.
119119
120+
The cosine distance is defined as ``1 - cosine_similarity``: the lowest
121+
value is 0 (identical point) but it is bounded above by 2 for the farthest
122+
points. Its value does not depend on the norm of the vector points but
123+
only on their relative angles.
124+
120125
Parameters
121126
----------
122127
@@ -387,8 +392,7 @@ def _query(self, X):
387392
return bin_queries, np.max(depths, axis=0)
388393

389394
def kneighbors(self, X, n_neighbors=None, return_distance=True):
390-
"""
391-
Returns the n_number of approximated nearest neighbors
395+
"""Returns n_neighbors of approximate nearest neighbors.
392396
393397
Parameters
394398
----------
@@ -436,13 +440,22 @@ def kneighbors(self, X, n_neighbors=None, return_distance=True):
436440
return np.array(neighbors)
437441

438442
def radius_neighbors(self, X, radius=None, return_distance=True):
439-
"""
440-
Returns the approximated nearest neighbors within the radius
443+
"""Finds the neighbors within a given radius of a point or points.
444+
445+
Return the indices and distances of some points from the dataset
446+
lying in a ball with size ``radius`` around the points of the query
447+
array. Points lying on the boundary are included in the results.
448+
449+
The result points are *not* necessarily sorted by distance to their
450+
query point.
451+
452+
LSH Forest being an approximate method, some true neighbors from the
453+
indexed dataset might be missing from the results.
441454
442455
Parameters
443456
----------
444457
X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)
445-
List of n_features-dimensional data points. Each row
458+
List of n_features-dimensional data points. Each row
446459
corresponds to a single query.
447460
448461
radius : float
@@ -455,12 +468,13 @@ def radius_neighbors(self, X, radius=None, return_distance=True):
455468
Returns
456469
-------
457470
dist : array, shape (n_samples,) of arrays
458-
Array representing the cosine distances to each point,
459-
only present if return_distance=True.
471+
Each element is an array representing the cosine distances
472+
to some points found within ``radius`` of the respective query.
473+
Only present if ``return_distance=True``.
460474
461475
ind : array, shape (n_samples,) of arrays
462-
An array of arrays of indices of the approximated nearest points
463-
with in the `radius` to the query in the population matrix.
476+
Each element is an array of indices for neighbors within ``radius``
477+
of the respective query.
464478
"""
465479
if not hasattr(self, 'hash_functions_'):
466480
raise ValueError("estimator should be fitted.")

sklearn/neighbors/base.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ def _get_weights(dist, weights):
8686
if dist.dtype is np.dtype(object):
8787
for point_dist_i, point_dist in enumerate(dist):
8888
# check if point_dist is iterable
89-
# (ex: RadiusNeighborClassifier.predict may set an element of dist
90-
# to 1e-6 to represent an 'outlier')
89+
# (ex: RadiusNeighborClassifier.predict may set an element of
90+
# dist to 1e-6 to represent an 'outlier')
9191
if hasattr(point_dist, '__contains__') and 0. in point_dist:
9292
dist[point_dist_i] = point_dist == 0.
9393
else:
@@ -489,7 +489,12 @@ class RadiusNeighborsMixin(object):
489489
def radius_neighbors(self, X=None, radius=None, return_distance=True):
490490
"""Finds the neighbors within a given radius of a point or points.
491491
492-
Returns indices of and distances to the neighbors of each point.
492+
Return the indices and distances of each point from the dataset
493+
lying in a ball with size ``radius`` around the points of the query
494+
array. Points lying on the boundary are included in the results.
495+
496+
The result points are *not* necessarily sorted by distance to their
497+
query point.
493498
494499
Parameters
495500
----------
@@ -507,18 +512,21 @@ def radius_neighbors(self, X=None, radius=None, return_distance=True):
507512
508513
Returns
509514
-------
510-
dist : array
511-
Array representing the euclidean distances to each point,
512-
only present if return_distance=True.
515+
dist : array, shape (n_samples,) of arrays
516+
Array representing the distances to each point, only present if
517+
return_distance=True. The distance values are computed according
518+
to the ``metric`` constructor parameter.
513519
514-
ind : array
515-
Indices of the nearest points in the population matrix.
520+
ind : array, shape (n_samples,) of arrays
521+
An array of arrays of indices of the points
522+
from the population matrix that lie within a ball of size
523+
``radius`` around the query points.
516524
517525
Examples
518526
--------
519527
In the following example, we construct a NeighborsClassifier
520528
class from an array representing our data set and ask who's
521-
the closest point to [1,1,1]
529+
the closest point to [1, 1, 1]:
522530
523531
>>> import numpy as np
524532
>>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]

sklearn/neighbors/tests/test_approximate.py

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,20 @@ def test_radius_neighbors():
166166
lshf.fit(X)
167167

168168
for i in range(n_iter):
169+
# Select a random point in the dataset as the query
169170
query = X[rng.randint(0, n_samples)]
171+
172+
# At least one neighbor should be returned when the radius is the
173+
# mean distance from the query to the points of the dataset.
170174
mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
171175
neighbors = lshf.radius_neighbors(query, radius=mean_dist,
172176
return_distance=False)
177+
173178
assert_equal(neighbors.shape, (1,))
174179
assert_equal(neighbors.dtype, object)
175180
assert_greater(neighbors[0].shape[0], 0)
176-
# All distances should be less than mean_dist
181+
# All distances to points in the results of the radius query should
182+
# be less than mean_dist
177183
distances, neighbors = lshf.radius_neighbors(query,
178184
radius=mean_dist,
179185
return_distance=True)
@@ -184,23 +190,33 @@ def test_radius_neighbors():
184190
queries = X[rng.randint(0, n_samples, n_queries)]
185191
distances, neighbors = lshf.radius_neighbors(queries,
186192
return_distance=True)
187-
assert_equal(neighbors.shape[0], n_queries)
188-
assert_equal(distances.shape[0], n_queries)
189-
# dists and inds should not be 2D arrays
190-
assert_equal(distances.ndim, 1)
191-
assert_equal(neighbors.ndim, 1)
193+
194+
# dists and inds should be 1D arrays of arrays of variable lengths,
195+
# hence the use of the object dtype.
196+
assert_equal(distances.shape, (n_queries,))
197+
assert_equal(distances.dtype, object)
198+
assert_equal(neighbors.shape, (n_queries,))
199+
assert_equal(neighbors.dtype, object)
192200

193201
# Compare with exact neighbor search
194202
query = X[rng.randint(0, n_samples)]
195203
mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
196-
nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
197-
nbrs.fit(X)
204+
nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)
198205

199-
distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
200206
distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
201-
# Distances of exact neighbors is less than or equal to approximate
202-
assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
203-
np.sort(distances_approx[0]))))
207+
distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
208+
209+
# Radius-based queries do not sort the result points and the order
210+
# depends on the method, the random_state and the dataset order. Therefore
211+
# we need to sort the results ourselves before performing any comparison.
212+
sorted_dists_exact = np.sort(distances_exact[0])
213+
sorted_dists_approx = np.sort(distances_approx[0])
214+
215+
# Distances to exact neighbors are less than or equal to approximate
216+
# counterparts as the approximate radius query might have missed some
217+
# closer neighbors.
218+
assert_true(np.all(np.less_equal(sorted_dists_exact,
219+
sorted_dists_approx)))
204220

205221

206222
def test_distances():
@@ -220,15 +236,13 @@ def test_distances():
220236
distances, neighbors = lshf.kneighbors(query,
221237
n_neighbors=n_neighbors,
222238
return_distance=True)
223-
# Returned neighbors should be from closest to farthest.
224-
assert_true(np.all(np.diff(distances[0]) >= 0))
225239

226-
mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
227-
distances, neighbors = lshf.radius_neighbors(query,
228-
radius=mean_dist,
229-
return_distance=True)
240+
# Returned neighbors should be from closest to farthest, that is
241+
# increasing distance values.
230242
assert_true(np.all(np.diff(distances[0]) >= 0))
231243

244+
# The radius_neighbors method does not guarantee the order of the results.
245+
232246

233247
def test_fit():
234248
"""Checks whether `fit` method sets all attribute values correctly."""
@@ -407,8 +421,3 @@ def test_sparse_input():
407421
assert_array_equal(a, b)
408422
for a, b in zip(i_sparse, i_dense):
409423
assert_array_equal(a, b)
410-
411-
412-
if __name__ == "__main__":
413-
import nose
414-
nose.runmodule()

0 commit comments

Comments
 (0)