ENH improve docstrings and test for radius_neighbors models

ogrisel · ogrisel · commit 4cc4a5df5eb2 · 2015-03-05T16:15:46.000+01:00
Make it more explicit that the order of the results of a call to
radius_neighbors is undefined.

FIX 4303
diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py
@@ -117,6 +117,11 @@ class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin):
     Random projection is used as the hash family which approximates
     cosine distance.
 
+    The cosine distance is defined as ``1 - cosine_similarity``: the lowest
+    value is 0 (vectors pointing in the same direction) and it is bounded
+    above by 2 (vectors pointing in opposite directions). Its value does not
+    depend on the norm of the vector points but only on their relative angles.
+
     Parameters
     ----------
 
@@ -387,8 +392,7 @@ def _query(self, X):
         return bin_queries, np.max(depths, axis=0)
 
     def kneighbors(self, X, n_neighbors=None, return_distance=True):
-        """
-        Returns the n_number of approximated nearest neighbors
+        """Returns the n_neighbors approximate nearest neighbors.
 
         Parameters
         ----------
@@ -436,13 +440,22 @@ def kneighbors(self, X, n_neighbors=None, return_distance=True):
             return np.array(neighbors)
 
     def radius_neighbors(self, X, radius=None, return_distance=True):
-        """
-        Returns the approximated nearest neighbors within the radius
+        """Finds the neighbors within a given radius of a point or points.
+
+        Return the indices and distances of some points from the dataset
+        lying in a ball with size ``radius`` around the points of the query
+        array. Points lying on the boundary are included in the results.
+
+        The result points are *not* necessarily sorted by distance to their
+        query point.
+
+        LSH Forest being an approximate method, some true neighbors from the
+        indexed dataset might be missing from the results.
 
         Parameters
         ----------
         X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)
-            List of n_features-dimensional data points.  Each row
+            List of n_features-dimensional data points. Each row
             corresponds to a single query.
 
         radius : float
@@ -455,12 +468,13 @@ def radius_neighbors(self, X, radius=None, return_distance=True):
         Returns
         -------
         dist : array, shape (n_samples,) of arrays
-            Array representing the cosine distances to each point,
-            only present if return_distance=True.
+            Each element is an array representing the cosine distances
+            to some points found within ``radius`` of the respective query.
+            Only present if ``return_distance=True``.
 
         ind : array, shape (n_samples,) of arrays
-            An array of arrays of indices of the approximated nearest points
-            with in the `radius` to the query in the population matrix.
+            Each element is an array of indices for neighbors within ``radius``
+            of the respective query.
         """
         if not hasattr(self, 'hash_functions_'):
             raise ValueError("estimator should be fitted.")
diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py
@@ -86,8 +86,8 @@ def _get_weights(dist, weights):
         if dist.dtype is np.dtype(object):
             for point_dist_i, point_dist in enumerate(dist):
                 # check if point_dist is iterable
-                # (ex: RadiusNeighborClassifier.predict may set an element of dist
-                # to 1e-6 to represent an 'outlier')
+                # (ex: RadiusNeighborsClassifier.predict may set an element of
+                # dist to 1e-6 to represent an 'outlier')
                 if hasattr(point_dist, '__contains__') and 0. in point_dist:
                     dist[point_dist_i] = point_dist == 0.
                 else:
@@ -489,7 +489,12 @@ class RadiusNeighborsMixin(object):
     def radius_neighbors(self, X=None, radius=None, return_distance=True):
         """Finds the neighbors within a given radius of a point or points.
 
-        Returns indices of and distances to the neighbors of each point.
+        Return the indices and distances of each point from the dataset
+        lying in a ball with size ``radius`` around the points of the query
+        array. Points lying on the boundary are included in the results.
+
+        The result points are *not* necessarily sorted by distance to their
+        query point.
 
         Parameters
         ----------
@@ -507,18 +512,21 @@ def radius_neighbors(self, X=None, radius=None, return_distance=True):
 
         Returns
         -------
-        dist : array
-            Array representing the euclidean distances to each point,
-            only present if return_distance=True.
+        dist : array, shape (n_samples,) of arrays
+            Array representing the distances to each point, only present if
+            return_distance=True. The distance values are computed according
+            to the ``metric`` constructor parameter.
 
-        ind : array
-            Indices of the nearest points in the population matrix.
+        ind : array, shape (n_samples,) of arrays
+            An array of arrays of indices of the points from the
+            population matrix that lie within a ball of size ``radius``
+            around the query points.
 
         Examples
         --------
         In the following example, we construct a NeighborsClassifier
         class from an array representing our data set and ask who's
-        the closest point to [1,1,1]
+        the closest point to [1, 1, 1]:
 
         >>> import numpy as np
         >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
diff --git a/sklearn/neighbors/tests/test_approximate.py b/sklearn/neighbors/tests/test_approximate.py
@@ -166,14 +166,20 @@ def test_radius_neighbors():
     lshf.fit(X)
 
     for i in range(n_iter):
+        # Select a random point in the dataset as the query
         query = X[rng.randint(0, n_samples)]
+
+        # At least one neighbor should be returned when the radius is the
+        # mean distance from the query to the points of the dataset.
         mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
         neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                           return_distance=False)
+
         assert_equal(neighbors.shape, (1,))
         assert_equal(neighbors.dtype, object)
         assert_greater(neighbors[0].shape[0], 0)
-        # All distances should be less than mean_dist
+        # All distances to points in the results of the radius query should
+        # be less than mean_dist
         distances, neighbors = lshf.radius_neighbors(query,
                                                      radius=mean_dist,
                                                      return_distance=True)
@@ -184,23 +190,33 @@ def test_radius_neighbors():
     queries = X[rng.randint(0, n_samples, n_queries)]
     distances, neighbors = lshf.radius_neighbors(queries,
                                                  return_distance=True)
-    assert_equal(neighbors.shape[0], n_queries)
-    assert_equal(distances.shape[0], n_queries)
-    # dists and inds should not be 2D arrays
-    assert_equal(distances.ndim, 1)
-    assert_equal(neighbors.ndim, 1)
+
+    # dists and inds should be 1D arrays of arrays of variable lengths,
+    # hence the use of the object dtype.
+    assert_equal(distances.shape, (n_queries,))
+    assert_equal(distances.dtype, object)
+    assert_equal(neighbors.shape, (n_queries,))
+    assert_equal(neighbors.dtype, object)
 
     # Compare with exact neighbor search
     query = X[rng.randint(0, n_samples)]
     mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
-    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
-    nbrs.fit(X)
+    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)
 
-    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
     distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
-    # Distances of exact neighbors is less than or equal to approximate
-    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
-                                     np.sort(distances_approx[0]))))
+    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
+
+    # Radius-based queries do not sort the result points and the order
+    # depends on the method, the random_state and the dataset order. Therefore
+    # we need to sort the results ourselves before performing any comparison.
+    sorted_dists_exact = np.sort(distances_exact[0])
+    sorted_dists_approx = np.sort(distances_approx[0])
+
+    # Distances to exact neighbors are less than or equal to approximate
+    # counterparts as the approximate radius query might have missed some
+    # closer neighbors.
+    assert_true(np.all(np.less_equal(sorted_dists_exact,
+                                     sorted_dists_approx)))
 
 
 def test_distances():
@@ -220,15 +236,13 @@ def test_distances():
         distances, neighbors = lshf.kneighbors(query,
                                                n_neighbors=n_neighbors,
                                                return_distance=True)
-        # Returned neighbors should be from closest to farthest.
-        assert_true(np.all(np.diff(distances[0]) >= 0))
 
-        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
-        distances, neighbors = lshf.radius_neighbors(query,
-                                                     radius=mean_dist,
-                                                     return_distance=True)
+        # Returned neighbors should be from closest to farthest, that is
+        # increasing distance values.
         assert_true(np.all(np.diff(distances[0]) >= 0))
 
+        # The radius_neighbors method does not guarantee the result order.
+
 
 def test_fit():
     """Checks whether `fit` method sets all attribute values correctly."""
@@ -407,8 +421,3 @@ def test_sparse_input():
         assert_array_equal(a, b)
     for a, b in zip(i_sparse, i_dense):
         assert_array_equal(a, b)
-
-
-if __name__ == "__main__":
-    import nose
-    nose.runmodule()