From e3024d536707baa5813dae7883817b91b91e9715 Mon Sep 17 00:00:00 2001
From: Erich Schubert
Date: Sat, 10 Jan 2015 19:40:28 +0100
Subject: [PATCH 1/2] Do not shuffle in DBSCAN (warn if `random_state` is
 used).

This makes little difference, and the original DBSCAN did not shuffle;
warn if `random_state` is used. As is, `random_state` encourages users
to experiment with different randomizations, as one would with k-means.
But in contrast to k-means, the output of DBSCAN is deterministic except
for cluster enumeration and "rare" cases where a point is on the border
of two clusters at the same time. As this affects single points only,
the measurable performance difference will be close to zero.

Also incorporate a fix for minpts (`min_samples`) so that it includes
the query point itself.
---
 sklearn/cluster/dbscan_.py | 55 +++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 896cf0c20d350..10b5d66575426 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -8,11 +8,13 @@
 #
 # License: BSD 3 clause
 
+import warnings
+
 import numpy as np
 
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..utils import check_random_state, check_array, check_consistent_length
+from ..utils import check_array, check_consistent_length
 from ..neighbors import NearestNeighbors
 
@@ -34,7 +36,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
 
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point itself.
 
     metric : string, or callable
         The metric to use when calculating distance between instances in a
@@ -60,13 +62,14 @@
         between points.
 
     sample_weight : array, shape (n_samples,), optional
-        Weight of each sample, such that a sample with weight greater
-        than ``min_samples`` is automatically a core sample; a sample with
-        negative weight may inhibit its eps-neighbor from being core.
+        Weight of each sample, such that a sample with a weight of at least
+        ``min_samples`` is by itself a core sample; a sample with negative
+        weight may inhibit its eps-neighbor from being core.
         Note that weights are absolute, and default to 1.
 
     random_state: numpy.RandomState, optional
-        The generator used to shuffle the samples. Defaults to numpy.random.
+        Ignored, will be removed in the next version.
+        (DBSCAN does not use random initialization).
 
     Returns
     -------
@@ -80,6 +83,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     -----
     See examples/cluster/plot_dbscan.py for an example.
 
+    This implementation bulk-computes all neighborhood queries, which
+    increases the memory complexity to O(n·d), where d is the average number
+    of neighbors, while the original DBSCAN had memory complexity O(n).
+
     References
     ----------
     Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
@@ -89,15 +96,16 @@
     """
     if not eps > 0.0:
         raise ValueError("eps must be positive.")
+    if random_state is not None:
+        warnings.warn("The parameter random_state is ignored "
+                      "and will be removed in the next version. "
" + + "(DBSCAN is deterministic except for rare border cases)") X = check_array(X, accept_sparse='csr') if sample_weight is not None: sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) - # If index order not given, create random order. - random_state = check_random_state(random_state) - # Calculate neighborhood for all samples. This leaves the original point # in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. While True, its useless information) @@ -109,6 +117,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', leaf_size=leaf_size, metric=metric, p=p) neighbors_model.fit(X) + # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors(X, eps, return_distance=False) neighborhoods = np.array(neighborhoods) @@ -122,15 +131,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', labels = -np.ones(X.shape[0], dtype=np.int) # A list of all core samples found. - core_samples = np.flatnonzero(n_neighbors > min_samples) - index_order = core_samples[random_state.permutation(core_samples.shape[0])] + core_samples = np.flatnonzero(n_neighbors >= min_samples) # label_num is the label given to the new cluster label_num = 0 # Look at all samples and determine if they are core. # If they are then build a new cluster from them. - for index in index_order: + for index in core_samples: # Already classified if labels[index] != -1: continue @@ -170,7 +178,7 @@ class DBSCAN(BaseEstimator, ClusterMixin): as in the same neighborhood. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. + to be considered as a core point. This includes the point itself. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of @@ -178,8 +186,6 @@ class DBSCAN(BaseEstimator, ClusterMixin): metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. - random_state : numpy.RandomState, optional - The generator used to shuffle the samples. Defaults to numpy.random. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. @@ -189,6 +195,9 @@ class DBSCAN(BaseEstimator, ClusterMixin): of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. + random_state: numpy.RandomState, optional + Ignored, will be removed in the next version. + (DBSCAN does not use random initialization). Attributes ---------- @@ -206,6 +215,10 @@ class DBSCAN(BaseEstimator, ClusterMixin): ----- See examples/cluster/plot_dbscan.py for an example. + This implementation bulk-computes all neighborhood queries, which increases + the memory complexity to O(n.d) where d is the average number of neighbors, + while original DBSCAN had memory complexity O(n). + References ---------- Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based @@ -234,9 +247,9 @@ def fit(self, X, y=None, sample_weight=None): A feature array, or array of distances between samples if ``metric='precomputed'``. 
From aa5a01da0d1737938089175232c3030242648632 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Sun, 11 Jan 2015 01:51:29 +1100
Subject: [PATCH 2/2] FIX/TST boundary cases in dbscan (closes #4073)

---
 sklearn/cluster/tests/test_dbscan.py | 35 +++++++++++++++++++---------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
index cb2c449b44788..4fd7af3bc4110 100644
--- a/sklearn/cluster/tests/test_dbscan.py
+++ b/sklearn/cluster/tests/test_dbscan.py
@@ -12,6 +12,8 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_in
+from sklearn.utils.testing import assert_not_in
 from sklearn.cluster.dbscan_ import DBSCAN
 from sklearn.cluster.dbscan_ import dbscan
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -83,7 +85,7 @@ def test_dbscan_no_core_samples():
     X[X < .8] = 0
 
     for X_ in [X, sparse.csr_matrix(X)]:
-        db = DBSCAN().fit(X_)
+        db = DBSCAN(min_samples=6).fit(X_)
         assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
         assert_array_equal(db.labels_, -1)
         assert_equal(db.core_sample_indices_.shape, (0,))
@@ -185,6 +187,17 @@ def test_pickle():
     assert_equal(type(pickle.loads(s)), obj.__class__)
 
 
+def test_boundaries():
+    # ensure min_samples is inclusive of core point
+    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
+    assert_in(0, core)
+    # ensure eps is inclusive of circumference
+    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
+    assert_in(0, core)
+    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
+    assert_not_in(0, core)
+
+
 def test_weighted_dbscan():
     # ensure sample_weight is validated
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
@@ -192,26 +205,26 @@ def test_weighted_dbscan():
     # ensure sample_weight has an effect
     assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
-                                   min_samples=5)[0])
+                                   min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
-                                      min_samples=5)[0])
+                                      min_samples=6)[0])
 
     # points within eps of each other:
     assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
-                                      sample_weight=[5, 1], min_samples=5)[0])
+                                      sample_weight=[5, 1], min_samples=6)[0])
     # and effect of non-positive and non-integer sample_weight:
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
-                                  eps=1.5, min_samples=5)[0])
-    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
+                                  eps=1.5, min_samples=6)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
-                                  eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])
 
     # for non-negative sample_weight, cores should be identical to repetition
     rng = np.random.RandomState(42)
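A companion sketch for PATCH 2/2 (illustrative only, not part of the
patch), replaying the weighted boundary cases asserted above under the
inclusive `>=` rule; all parameter values mirror the tests:

    from sklearn.cluster.dbscan_ import dbscan

    X = [[0], [1]]

    # At the default eps=0.5 the two points do not see each other, so a
    # sample whose own weight reaches min_samples is core by itself.
    core, _ = dbscan(X, sample_weight=[6, 5], min_samples=6)
    print(core)  # [0]

    # At eps=1.5 each neighborhood contains both points, so the weights
    # add up: 5.9 + 0.1 >= 6 makes both points core.
    core, _ = dbscan(X, eps=1.5, sample_weight=[5.9, 0.1], min_samples=6)
    print(core)  # [0 1]

    # As the last comment above states, a non-negative integer weight k
    # behaves like repeating the sample k times.
    core_w, _ = dbscan(X, eps=1.5, sample_weight=[3, 3], min_samples=6)
    core_r, _ = dbscan([[0]] * 3 + [[1]] * 3, eps=1.5, min_samples=6)
    print(len(core_w), len(core_r))  # 2 and 6: the same clustering; with
                                     # repetition every copy is core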