From e87bcfed1e7269bfad22df0ee6e376aab00fb553 Mon Sep 17 00:00:00 2001 From: Erich Schubert Date: Thu, 28 Mar 2019 17:07:49 +0100 Subject: [PATCH] Clarify eps parameter misunderstanding As seen here: https://stackoverflow.com/a/55388827/1939754 the old description of the eps parameter can be misunderstood as a maximum distance of any two points. Also add a reference that discusses parameterization. --- doc/modules/clustering.rst | 18 +++++++++++++++++- sklearn/cluster/dbscan_.py | 22 ++++++++++++++++++---- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 9a77f657e2257..559dfeb8b89d7 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -752,6 +752,18 @@ Any core sample is part of a cluster, by definition. Any sample that is not a core sample, and is at least ``eps`` in distance from any core sample, is considered an outlier by the algorithm. +While the parameter ``min_samples`` primarily controls how tolerant the +algorithm is towards noise (on noisy and large data sets it may be desirable +to increase this parameter), the parameter ``eps`` is *crucial to choose +appropriately* for the data set and distance function and usually cannot be +left at the default value. It controls the local neighborhood of the points. +When chosen too small, most data will not be clustered at all (and labeled +as ``-1`` for "noise"). When chosen too large, it causes close clusters to +be merged into one cluster, and eventually the entire data set to be returned +as a single cluster. Some heuristics for choosing this parameter have been +discussed in the literature, for example based on a knee in the nearest neighbor +distances plot (as discussed in the references below). + In the figure below, the color indicates cluster membership, with large circles indicating core samples found by the algorithm. Smaller circles are non-core samples that are still part of a cluster. 
Moreover, the outliers are indicated @@ -793,7 +805,7 @@ by black points below. This implementation is by default not memory efficient because it constructs a full pairwise similarity matrix in the case where kd-trees or ball-trees cannot - be used (e.g. with sparse matrices). This matrix will consume n^2 floats. + be used (e.g., with sparse matrices). This matrix will consume n^2 floats. A couple of mechanisms for getting around this are: - A sparse radius neighborhood graph (where missing entries are presumed to @@ -814,6 +826,10 @@ by black points below. In Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996 + * "DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + In ACM Transactions on Database Systems (TODS), 42(3), 19. + .. _birch: Birch diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index f21beb3f91453..5b1d7c9c6c2ee 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -35,8 +35,11 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, ``metric='precomputed'``. eps : float, optional - The maximum distance between two samples for them to be considered - as in the same neighborhood. + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point @@ -128,6 +131,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Algorithm for Discovering Clusters in Large Spatial Databases with Noise". 
In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 + + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. + ACM Transactions on Database Systems (TODS), 42(3), 19. """ if not eps > 0.0: raise ValueError("eps must be positive.") @@ -194,8 +201,11 @@ class DBSCAN(BaseEstimator, ClusterMixin): Parameters ---------- eps : float, optional - The maximum distance between two samples for them to be considered - as in the same neighborhood. + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point @@ -299,6 +309,10 @@ class DBSCAN(BaseEstimator, ClusterMixin): Algorithm for Discovering Clusters in Large Spatial Databases with Noise". In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 + + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. + ACM Transactions on Database Systems (TODS), 42(3), 19. """ def __init__(self, eps=0.5, min_samples=5, metric='euclidean',