From cdb05778797c73020b347f509c9f7208e1d391f1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 11 Jan 2015 01:51:29 +1100 Subject: [PATCH 1/2] FIX/TST boundary cases in dbscan --- sklearn/cluster/dbscan_.py | 16 ++++++++------ sklearn/cluster/tests/test_dbscan.py | 33 +++++++++++++++++++--------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 896cf0c20d350..3e3049790eb54 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -34,7 +34,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. + to be considered as a core point. This number is inclusive of the + core point. metric : string, or callable The metric to use when calculating distance between instances in a @@ -122,7 +123,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', labels = -np.ones(X.shape[0], dtype=np.int) # A list of all core samples found. - core_samples = np.flatnonzero(n_neighbors > min_samples) + core_samples = np.flatnonzero(n_neighbors >= min_samples) index_order = core_samples[random_state.permutation(core_samples.shape[0])] # label_num is the label given to the new cluster @@ -170,7 +171,8 @@ class DBSCAN(BaseEstimator, ClusterMixin): as in the same neighborhood. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. + to be considered as a core point. This number is inclusive of the + core point. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of @@ -234,8 +236,8 @@ def fit(self, X, y=None, sample_weight=None): A feature array, or array of distances between samples if ``metric='precomputed'``. sample_weight : array, shape (n_samples,), optional - Weight of each sample, such that a sample with weight greater - than ``min_samples`` is automatically a core sample; a sample with + Weight of each sample, such that a sample with weight at least + ``min_samples`` is automatically a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. """ @@ -260,8 +262,8 @@ def fit_predict(self, X, y=None, sample_weight=None): A feature array, or array of distances between samples if ``metric='precomputed'``. sample_weight : array, shape (n_samples,), optional - Weight of each sample, such that a sample with weight greater - than ``min_samples`` is automatically a core sample; a sample with + Weight of each sample, such that a sample with weight at least + ``min_samples`` is automatically a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index cb2c449b44788..d525e0c4fa9db 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -12,6 +12,8 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_in +from sklearn.utils.testing import assert_not_in from sklearn.cluster.dbscan_ import DBSCAN from sklearn.cluster.dbscan_ import dbscan from sklearn.cluster.tests.common import generate_clustered_data @@ -185,6 +187,17 @@ def test_pickle(): assert_equal(type(pickle.loads(s)), obj.__class__) +def test_boundaries(): + # ensure min_samples is inclusive of core point + core, _ = dbscan([[0], [1]], eps=2, min_samples=2) + assert_in(0, core) + # ensure eps is inclusive of circumference + core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2) + assert_in(0, core) + core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2) + assert_not_in(0, core) + + def test_weighted_dbscan(): # ensure sample_weight is validated assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2]) @@ -192,26 +205,26 @@ def test_weighted_dbscan(): # ensure sample_weight has an effect assert_array_equal([], dbscan([[0], [1]], sample_weight=None, - min_samples=5)[0]) + min_samples=6)[0]) assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5], - min_samples=5)[0]) + min_samples=6)[0]) assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5], - min_samples=5)[0]) + min_samples=6)[0]) assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6], - min_samples=5)[0]) + min_samples=6)[0]) # points within eps of each other: assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5, - sample_weight=[5, 1], min_samples=5)[0]) + sample_weight=[5, 1], min_samples=6)[0]) # and effect of non-positive and non-integer sample_weight: assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0], - eps=1.5, min_samples=5)[0]) - assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1], - eps=1.5, min_samples=5)[0]) + eps=1.5, min_samples=6)[0]) + assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], + eps=1.5, min_samples=6)[0]) assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0], - eps=1.5, min_samples=5)[0]) + eps=1.5, min_samples=6)[0]) assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1], - eps=1.5, min_samples=5)[0]) + eps=1.5, min_samples=6)[0]) # for non-negative sample_weight, cores should be identical to repetition rng = np.random.RandomState(42) From b875625eccd26bc2d10664d0a57a381f32fa89ff Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 5 Mar 2015 11:13:54 +1100 Subject: [PATCH 2/2] TST fix test_dbscan_no_core_samples condition --- sklearn/cluster/tests/test_dbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index d525e0c4fa9db..81073b616f76d 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -81,11 +81,11 @@ def test_dbscan_sparse(): def test_dbscan_no_core_samples(): rng = np.random.RandomState(0) - X = rng.rand(40, 10) + X = rng.rand(15, 10) X[X < .8] = 0 for X_ in [X, sparse.csr_matrix(X)]: - db = DBSCAN().fit(X_) + db = DBSCAN(min_samples=50).fit(X_) assert_array_equal(db.components_, np.empty((0, X_.shape[1]))) assert_array_equal(db.labels_, -1) assert_equal(db.core_sample_indices_.shape, (0,))