From e3024d536707baa5813dae7883817b91b91e9715 Mon Sep 17 00:00:00 2001
From: Erich Schubert
Date: Sat, 10 Jan 2015 19:40:28 +0100
Subject: [PATCH 1/2] Do not shuffle in DBSCAN (warn if `random_state` is
 used).

This makes little difference, and the original DBSCAN did not shuffle;
warn if `random_state` is used. As is, `random_state` encourages users
to experiment with different randomizations, as one would with k-means.
But in contrast to k-means, the output of DBSCAN is deterministic except
for cluster enumeration and "rare" cases where a point is on the border
of two clusters at the same time. As this affects single points only,
the measurable performance difference will be close to zero.

Also incorporate a fix for minpts (`min_samples`) so that it includes
the query point itself.
---
 sklearn/cluster/dbscan_.py | 55 +++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 896cf0c20d350..10b5d66575426 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -8,11 +8,13 @@
 #
 # License: BSD 3 clause
 
+import warnings
+
 import numpy as np
 
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..utils import check_random_state, check_array, check_consistent_length
+from ..utils import check_array, check_consistent_length
 from ..neighbors import NearestNeighbors
 
@@ -34,7 +36,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
 
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point itself.
 
     metric : string, or callable
         The metric to use when calculating distance between instances in a
@@ -60,13 +62,14 @@
         between points.
 
     sample_weight : array, shape (n_samples,), optional
-        Weight of each sample, such that a sample with weight greater
-        than ``min_samples`` is automatically a core sample; a sample with
-        negative weight may inhibit its eps-neighbor from being core.
+        Weight of each sample, such that a sample with a weight of at least
+        ``min_samples`` is by itself a core sample; a sample with negative
+        weight may inhibit its eps-neighbor from being core.
         Note that weights are absolute, and default to 1.
 
     random_state: numpy.RandomState, optional
-        The generator used to shuffle the samples. Defaults to numpy.random.
+        Ignored, will be removed in the next version.
+        (DBSCAN does not use random initialization).
 
     Returns
     -------
@@ -80,6 +83,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     -----
     See examples/cluster/plot_dbscan.py for an example.
 
+    This implementation bulk-computes all neighborhood queries, which
+    increases the memory complexity to O(n·d), where d is the average number
+    of neighbors, while the original DBSCAN had memory complexity O(n).
+
     References
     ----------
     Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
@@ -89,15 +96,16 @@
     """
     if not eps > 0.0:
         raise ValueError("eps must be positive.")
+    if random_state is not None:
+        warnings.warn("The parameter random_state is ignored "
+                      "and will be removed in the next version. "
" + + "(DBSCAN is deterministic except for rare border cases)") X = check_array(X, accept_sparse='csr') if sample_weight is not None: sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) - # If index order not given, create random order. - random_state = check_random_state(random_state) - # Calculate neighborhood for all samples. This leaves the original point # in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. While True, its useless information) @@ -109,6 +117,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', leaf_size=leaf_size, metric=metric, p=p) neighbors_model.fit(X) + # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors(X, eps, return_distance=False) neighborhoods = np.array(neighborhoods) @@ -122,15 +131,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', labels = -np.ones(X.shape[0], dtype=np.int) # A list of all core samples found. - core_samples = np.flatnonzero(n_neighbors > min_samples) - index_order = core_samples[random_state.permutation(core_samples.shape[0])] + core_samples = np.flatnonzero(n_neighbors >= min_samples) # label_num is the label given to the new cluster label_num = 0 # Look at all samples and determine if they are core. # If they are then build a new cluster from them. - for index in index_order: + for index in core_samples: # Already classified if labels[index] != -1: continue @@ -170,7 +178,7 @@ class DBSCAN(BaseEstimator, ClusterMixin): as in the same neighborhood. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. + to be considered as a core point. This includes the point itself. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of @@ -178,8 +186,6 @@ class DBSCAN(BaseEstimator, ClusterMixin): metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. - random_state : numpy.RandomState, optional - The generator used to shuffle the samples. Defaults to numpy.random. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. @@ -189,6 +195,9 @@ class DBSCAN(BaseEstimator, ClusterMixin): of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. + random_state: numpy.RandomState, optional + Ignored, will be removed in the next version. + (DBSCAN does not use random initialization). Attributes ---------- @@ -206,6 +215,10 @@ class DBSCAN(BaseEstimator, ClusterMixin): ----- See examples/cluster/plot_dbscan.py for an example. + This implementation bulk-computes all neighborhood queries, which increases + the memory complexity to O(n.d) where d is the average number of neighbors, + while original DBSCAN had memory complexity O(n). + References ---------- Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based @@ -234,9 +247,9 @@ def fit(self, X, y=None, sample_weight=None): A feature array, or array of distances between samples if ``metric='precomputed'``. 
From aa5a01da0d1737938089175232c3030242648632 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Sun, 11 Jan 2015 01:51:29 +1100
Subject: [PATCH 2/2] FIX/TST boundary cases in dbscan (closes #4073)

---
 sklearn/cluster/tests/test_dbscan.py | 35 +++++++++++++++++++---------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
index cb2c449b44788..4fd7af3bc4110 100644
--- a/sklearn/cluster/tests/test_dbscan.py
+++ b/sklearn/cluster/tests/test_dbscan.py
@@ -12,6 +12,8 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_in
+from sklearn.utils.testing import assert_not_in
 from sklearn.cluster.dbscan_ import DBSCAN
 from sklearn.cluster.dbscan_ import dbscan
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -83,7 +85,7 @@ def test_dbscan_no_core_samples():
     X[X < .8] = 0
 
     for X_ in [X, sparse.csr_matrix(X)]:
-        db = DBSCAN().fit(X_)
+        db = DBSCAN(min_samples=6).fit(X_)
         assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
         assert_array_equal(db.labels_, -1)
         assert_equal(db.core_sample_indices_.shape, (0,))
@@ -185,6 +187,17 @@ def test_pickle():
     assert_equal(type(pickle.loads(s)), obj.__class__)
 
 
+def test_boundaries():
+    # ensure min_samples is inclusive of core point
+    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
+    assert_in(0, core)
+    # ensure eps is inclusive of circumference
+    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
+    assert_in(0, core)
+    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
+    assert_not_in(0, core)
+
+
 def test_weighted_dbscan():
     # ensure sample_weight is validated
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
@@ -192,26 +205,26 @@ def test_weighted_dbscan():
     # ensure sample_weight has an effect
     assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
-                                   min_samples=5)[0])
+                                   min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
-                                      min_samples=5)[0])
+                                      min_samples=6)[0])
 
     # points within eps of each other:
     assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
-                                      sample_weight=[5, 1], min_samples=5)[0])
+                                      sample_weight=[5, 1], min_samples=6)[0])
     # and effect of non-positive and non-integer sample_weight:
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
-                                  eps=1.5, min_samples=5)[0])
-    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
+                                  eps=1.5, min_samples=6)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
-                                  eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])
 
     # for non-negative sample_weight, cores should be identical to repetition
     rng = np.random.RandomState(42)
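A companion sketch for PATCH 2/2 (illustrative only, not part of the
patch), replaying the weighted boundary cases asserted above under the
inclusive `>=` rule; all parameter values mirror the tests:

    from sklearn.cluster.dbscan_ import dbscan

    X = [[0], [1]]

    # At the default eps=0.5 the two points do not see each other, so a
    # sample whose own weight reaches min_samples is core by itself.
    core, _ = dbscan(X, sample_weight=[6, 5], min_samples=6)
    print(core)  # [0]

    # At eps=1.5 each neighborhood contains both points, so the weights
    # add up: 5.9 + 0.1 >= 6 makes both points core.
    core, _ = dbscan(X, eps=1.5, sample_weight=[5.9, 0.1], min_samples=6)
    print(core)  # [0 1]

    # As the last comment above states, a non-negative integer weight k
    # behaves like repeating the sample k times.
    core_w, _ = dbscan(X, eps=1.5, sample_weight=[3, 3], min_samples=6)
    core_r, _ = dbscan([[0]] * 3 + [[1]] * 3, eps=1.5, min_samples=6)
    print(len(core_w), len(core_r))  # 2 and 6: the same clustering; with
                                     # repetition every copy is core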