From cdb05778797c73020b347f509c9f7208e1d391f1 Mon Sep 17 00:00:00 2001
From: Joel Nothman <joel.nothman@gmail.com>
Date: Sun, 11 Jan 2015 01:51:29 +1100
Subject: [PATCH 1/2] FIX/TST boundary cases in dbscan

---
 sklearn/cluster/dbscan_.py           | 16 ++++++++------
 sklearn/cluster/tests/test_dbscan.py | 33 +++++++++++++++++++---------
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 896cf0c20d350..3e3049790eb54 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -34,7 +34,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
 
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point
+        itself.
 
     metric : string, or callable
         The metric to use when calculating distance between instances in a
@@ -122,7 +123,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     labels = -np.ones(X.shape[0], dtype=np.int)
 
     # A list of all core samples found.
-    core_samples = np.flatnonzero(n_neighbors > min_samples)
+    core_samples = np.flatnonzero(n_neighbors >= min_samples)
     index_order = core_samples[random_state.permutation(core_samples.shape[0])]
 
     # label_num is the label given to the new cluster
@@ -170,7 +171,8 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         as in the same neighborhood.
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point
+        itself.
     metric : string, or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
@@ -234,8 +236,8 @@ def fit(self, X, y=None, sample_weight=None):
             A feature array, or array of distances between samples if
             ``metric='precomputed'``.
         sample_weight : array, shape (n_samples,), optional
-            Weight of each sample, such that a sample with weight greater
-            than ``min_samples`` is automatically a core sample; a sample with
+            Weight of each sample, such that a sample with weight at least
+            ``min_samples`` is automatically a core sample; a sample with
             negative weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.
         """
@@ -260,8 +262,8 @@ def fit_predict(self, X, y=None, sample_weight=None):
             A feature array, or array of distances between samples if
             ``metric='precomputed'``.
         sample_weight : array, shape (n_samples,), optional
-            Weight of each sample, such that a sample with weight greater
-            than ``min_samples`` is automatically a core sample; a sample with
+            Weight of each sample, such that a sample with weight at least
+            ``min_samples`` is automatically a core sample; a sample with
             negative weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.
 
diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
index cb2c449b44788..d525e0c4fa9db 100644
--- a/sklearn/cluster/tests/test_dbscan.py
+++ b/sklearn/cluster/tests/test_dbscan.py
@@ -12,6 +12,8 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_in
+from sklearn.utils.testing import assert_not_in
 from sklearn.cluster.dbscan_ import DBSCAN
 from sklearn.cluster.dbscan_ import dbscan
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -185,6 +187,17 @@ def test_pickle():
     assert_equal(type(pickle.loads(s)), obj.__class__)
 
 
+def test_boundaries():
+    # ensure min_samples counts the candidate point itself
+    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
+    assert_in(0, core)
+    # ensure eps is inclusive: a point at distance exactly eps is a neighbor
+    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
+    assert_in(0, core)
+    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
+    assert_not_in(0, core)
+
+
 def test_weighted_dbscan():
     # ensure sample_weight is validated
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
@@ -192,26 +205,26 @@ def test_weighted_dbscan():
 
     # ensure sample_weight has an effect
     assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
-                                   min_samples=5)[0])
+                                   min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
-                                      min_samples=5)[0])
+                                      min_samples=6)[0])
 
     # points within eps of each other:
     assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
-                                      sample_weight=[5, 1], min_samples=5)[0])
+                                      sample_weight=[5, 1], min_samples=6)[0])
     # and effect of non-positive and non-integer sample_weight:
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
-                                  eps=1.5, min_samples=5)[0])
-    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
-                                      eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
-                                  eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])
 
     # for non-negative sample_weight, cores should be identical to repetition
     rng = np.random.RandomState(42)

From b875625eccd26bc2d10664d0a57a381f32fa89ff Mon Sep 17 00:00:00 2001
From: Joel Nothman <joel.nothman@gmail.com>
Date: Thu, 5 Mar 2015 11:13:54 +1100
Subject: [PATCH 2/2] TST fix test_dbscan_no_core_samples condition

---
 sklearn/cluster/tests/test_dbscan.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
index d525e0c4fa9db..81073b616f76d 100644
--- a/sklearn/cluster/tests/test_dbscan.py
+++ b/sklearn/cluster/tests/test_dbscan.py
@@ -81,11 +81,11 @@ def test_dbscan_sparse():
 
 def test_dbscan_no_core_samples():
     rng = np.random.RandomState(0)
-    X = rng.rand(40, 10)
+    X = rng.rand(15, 10)
     X[X < .8] = 0
 
     for X_ in [X, sparse.csr_matrix(X)]:
-        db = DBSCAN().fit(X_)
+        db = DBSCAN(min_samples=50).fit(X_)
         assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
         assert_array_equal(db.labels_, -1)
         assert_equal(db.core_sample_indices_.shape, (0,))