[MRG + 1] Do not shuffle by default for DBSCAN. #4066

Closed. Wants to merge 2 commits.

sklearn/cluster/dbscan_.py: 55 changes (34 additions, 21 deletions)

@@ -8,11 +8,13 @@
 #
 # License: BSD 3 clause

+import warnings
+
 import numpy as np

 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..utils import check_random_state, check_array, check_consistent_length
+from ..utils import check_array, check_consistent_length
 from ..neighbors import NearestNeighbors


@@ -34,7 +36,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',

     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point itself.

     metric : string, or callable
         The metric to use when calculating distance between instances in a
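
For illustration, the new inclusive counting in a standalone snippet (this mirrors the test_boundaries test added in this PR; the data is illustrative):

    import numpy as np
    from sklearn.cluster import dbscan

    # Two points at distance 1 with eps=2: each eps-neighborhood contains
    # both points, so min_samples=2 makes both core under the new rule
    # that a point counts toward its own neighborhood.
    core, labels = dbscan(np.array([[0.], [1.]]), eps=2, min_samples=2)
    print(core)    # expected: [0 1]
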
@@ -60,13 +62,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
         between points.

     sample_weight : array, shape (n_samples,), optional
-        Weight of each sample, such that a sample with weight greater
-        than ``min_samples`` is automatically a core sample; a sample with
-        negative weight may inhibit its eps-neighbor from being core.
+        Weight of each sample, such that a sample with a weight of at least
+        ``min_samples`` is by itself a core sample; a sample with negative
+        weight may inhibit its eps-neighbor from being core.
         Note that weights are absolute, and default to 1.

     random_state: numpy.RandomState, optional
-        The generator used to shuffle the samples. Defaults to numpy.random.
+        Ignored, will be removed in the next version.
+        (DBSCAN does not use random initialization).

     Returns
     -------
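
The documented sample_weight semantics as a standalone snippet (a sketch with illustrative values, not part of the diff):

    import numpy as np
    from sklearn.cluster import dbscan

    # An isolated point whose weight reaches min_samples is core by itself;
    # its far-away, low-weight companion stays noise.
    core, labels = dbscan([[0], [10]], eps=1.5, sample_weight=[6, 1],
                          min_samples=6)
    print(core, labels)    # expected: [0] [ 0 -1]
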
@@ -80,6 +83,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     -----
     See examples/cluster/plot_dbscan.py for an example.

+    This implementation bulk-computes all neighborhood queries, which increases
+    the memory complexity to O(n.d) where d is the average number of neighbors,
+    while original DBSCAN had memory complexity O(n).
+
     References
     ----------
     Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
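
The memory note above in concrete terms; a sketch (sizes are illustrative):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.random.RandomState(0).rand(1000, 2)
    nn = NearestNeighbors(radius=0.1).fit(X)
    # One bulk call returns an array of neighbor-index arrays, so all n
    # neighborhoods live in memory at once: roughly n * d indices, where
    # d is the average neighborhood size.
    neighborhoods = nn.radius_neighbors(X, return_distance=False)
    print(sum(len(hood) for hood in neighborhoods))
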
@@ -89,15 +96,16 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     """
     if not eps > 0.0:
         raise ValueError("eps must be positive.")
+    if random_state is not None:

Review thread on the line above:
  Member: This should be a deprecation warning and should say that it will be removed in 0.18, I think.
  Member: This comment needs to be addressed before merging.
  Member: Indeed this has not been addressed yet.

warnings.warn("The parameter random_state is ignored " +
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: there is no need for the + sign here.

"and will be removed in the next version. " +
"(DBSCAN is deterministic except for rare border cases)")

     X = check_array(X, accept_sparse='csr')
     if sample_weight is not None:
         sample_weight = np.asarray(sample_weight)
         check_consistent_length(X, sample_weight)

-    # If index order not given, create random order.
-    random_state = check_random_state(random_state)
-
     # Calculate neighborhood for all samples. This leaves the original point
     # in, which needs to be considered later (i.e. point i is in the
     # neighborhood of point i. While True, its useless information)
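
The review thread above asks for a real DeprecationWarning that names the removal version and drops the needless string concatenation; a sketch of that suggestion (the helper name is hypothetical, not PR code):

    import warnings

    def _warn_deprecated_random_state(random_state):
        # Hypothetical helper reflecting the reviewers' request.
        if random_state is not None:
            warnings.warn("The parameter random_state is deprecated and "
                          "will be removed in 0.18 (DBSCAN is deterministic "
                          "except for rare border cases).", DeprecationWarning)
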
@@ -109,6 +117,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
                                            leaf_size=leaf_size,
                                            metric=metric, p=p)
         neighbors_model.fit(X)
+        # This has worst case O(n^2) memory complexity
         neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                          return_distance=False)
         neighborhoods = np.array(neighborhoods)
@@ -122,15 +131,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     labels = -np.ones(X.shape[0], dtype=np.int)

     # A list of all core samples found.
-    core_samples = np.flatnonzero(n_neighbors > min_samples)
-    index_order = core_samples[random_state.permutation(core_samples.shape[0])]
+    core_samples = np.flatnonzero(n_neighbors >= min_samples)

     # label_num is the label given to the new cluster
     label_num = 0

     # Look at all samples and determine if they are core.
     # If they are then build a new cluster from them.
-    for index in index_order:
+    for index in core_samples:
         # Already classified
         if labels[index] != -1:
             continue
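
With the shuffle gone, expansion follows plain index order, and the only order-sensitive outcome left is which cluster claims a shared border point; a sketch (hypothetical data, output reasoned from the patched loop):

    import numpy as np
    from sklearn.cluster import dbscan

    # Points at 0, 0, 1, 2, 3, 4, 4 with eps=1 and min_samples=4:
    # indices 2 and 4 are the only cores (4 neighbors each, themselves
    # included); index 3 is a border point within eps of both cores.
    X = np.array([[0.], [0.], [1.], [2.], [3.], [4.], [4.]])
    core, labels = dbscan(X, eps=1, min_samples=4)
    print(core)      # expected: [2 4]
    print(labels)    # expected: [0 0 0 0 1 1 1]; index 3 joins whichever
                     # cluster expands first, here the one seeded by core 2
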
@@ -170,16 +178,14 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         as in the same neighborhood.
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point itself.
     metric : string, or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
         the options allowed by metrics.pairwise.calculate_distance for its
         metric parameter.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square.
-    random_state : numpy.RandomState, optional
-        The generator used to shuffle the samples. Defaults to numpy.random.
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
         to compute pointwise distances and find nearest neighbors.
@@ -189,6 +195,9 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         of the construction and query, as well as the memory required
         to store the tree. The optimal value depends
         on the nature of the problem.
+    random_state: numpy.RandomState, optional
+        Ignored, will be removed in the next version.
+        (DBSCAN does not use random initialization).

     Attributes
     ----------
@@ -206,6 +215,10 @@ class DBSCAN(BaseEstimator, ClusterMixin):
     -----
     See examples/cluster/plot_dbscan.py for an example.

+    This implementation bulk-computes all neighborhood queries, which increases
+    the memory complexity to O(n.d) where d is the average number of neighbors,
+    while original DBSCAN had memory complexity O(n).
+
     References
     ----------
     Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
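
Typical estimator usage after this change, with random_state simply omitted; a sketch assuming the patched, inclusive min_samples counting (data is illustrative):

    import numpy as np
    from sklearn.cluster import DBSCAN

    # Two 1D runs of points spaced 0.1 apart, far from each other.
    X = np.vstack([np.arange(10).reshape(-1, 1) * 0.1,
                   np.arange(10).reshape(-1, 1) * 0.1 + 5])
    db = DBSCAN(eps=0.15, min_samples=3).fit(X)
    print(db.labels_)    # expected: ten 0s then ten 1s; the run endpoints
                         # are border points, not noise
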
@@ -234,9 +247,9 @@ def fit(self, X, y=None, sample_weight=None):
             A feature array, or array of distances between samples if
             ``metric='precomputed'``.
         sample_weight : array, shape (n_samples,), optional
-            Weight of each sample, such that a sample with weight greater
-            than ``min_samples`` is automatically a core sample; a sample with
-            negative weight may inhibit its eps-neighbor from being core.
+            Weight of each sample, such that a sample with a weight of at least
+            ``min_samples`` is by itself a core sample; a sample with negative
+            weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.
         """
         X = check_array(X, accept_sparse='csr')
@@ -260,9 +273,9 @@ def fit_predict(self, X, y=None, sample_weight=None):
             A feature array, or array of distances between samples if
             ``metric='precomputed'``.
         sample_weight : array, shape (n_samples,), optional
-            Weight of each sample, such that a sample with weight greater
-            than ``min_samples`` is automatically a core sample; a sample with
-            negative weight may inhibit its eps-neighbor from being core.
+            Weight of each sample, such that a sample with a weight of at least
+            ``min_samples`` is by itself a core sample; a sample with negative
+            weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.

         Returns

sklearn/cluster/tests/test_dbscan.py: 35 changes (24 additions, 11 deletions)

@@ -12,6 +12,8 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_in
+from sklearn.utils.testing import assert_not_in
 from sklearn.cluster.dbscan_ import DBSCAN
 from sklearn.cluster.dbscan_ import dbscan
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -83,7 +85,7 @@ def test_dbscan_no_core_samples():
     X[X < .8] = 0

     for X_ in [X, sparse.csr_matrix(X)]:
-        db = DBSCAN().fit(X_)
+        db = DBSCAN(min_samples=6).fit(X_)
         assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
         assert_array_equal(db.labels_, -1)
         assert_equal(db.core_sample_indices_.shape, (0,))
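
(The bump from the default min_samples=5 to min_samples=6 preserves the old effective threshold: core status is now decided by n_neighbors >= min_samples, counting the point itself, where it was previously n_neighbors > min_samples.)
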
@@ -185,33 +187,44 @@ def test_pickle():
     assert_equal(type(pickle.loads(s)), obj.__class__)


+def test_boundaries():
+    # ensure min_samples is inclusive of core point
+    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
+    assert_in(0, core)
+    # ensure eps is inclusive of circumference
+    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
+    assert_in(0, core)
+    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
+    assert_not_in(0, core)
+
+
 def test_weighted_dbscan():
     # ensure sample_weight is validated
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

     # ensure sample_weight has an effect
     assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
-                                   min_samples=5)[0])
+                                   min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
-                                      min_samples=5)[0])
+                                      min_samples=6)[0])

     # points within eps of each other:
     assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
-                                      sample_weight=[5, 1], min_samples=5)[0])
+                                      sample_weight=[5, 1], min_samples=6)[0])
     # and effect of non-positive and non-integer sample_weight:
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
-                                  eps=1.5, min_samples=5)[0])
-    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
-                                      eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
-                                  eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])

     # for non-negative sample_weight, cores should be identical to repetition
     rng = np.random.RandomState(42)
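
The property the truncated test above asserts, in a compact standalone sketch (illustrative data, not the PR's test; assumes integer weights behave exactly like repeated points):

    import numpy as np
    from sklearn.cluster import dbscan

    rng = np.random.RandomState(42)
    X = rng.rand(10, 2)
    weights = rng.randint(1, 4, size=10)    # integer weights 1..3

    # Weighted run vs. literally repeating each point `weight` times.
    core_weighted, _ = dbscan(X, eps=0.3, min_samples=3,
                              sample_weight=weights)
    X_repeated = np.repeat(X, weights, axis=0)
    core_repeated, _ = dbscan(X_repeated, eps=0.3, min_samples=3)

    # Map repeated-row indices back to original rows before comparing.
    original_index = np.repeat(np.arange(10), weights)
    assert set(original_index[core_repeated]) == set(core_weighted)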