From e87bcfed1e7269bfad22df0ee6e376aab00fb553 Mon Sep 17 00:00:00 2001
From: Erich Schubert <kno10@users.noreply.github.com>
Date: Thu, 28 Mar 2019 17:07:49 +0100
Subject: [PATCH] Clarify eps parameter misunderstanding

As seen here: https://stackoverflow.com/a/55388827/1939754
the old description of the eps parameter can be misunderstood as a maximum distance between any two points of a cluster.

Also add a reference that discusses parameterization.
---
 doc/modules/clustering.rst | 18 +++++++++++++++++-
 sklearn/cluster/dbscan_.py | 22 ++++++++++++++++++----
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 9a77f657e2257..559dfeb8b89d7 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -752,6 +752,18 @@ Any core sample is part of a cluster, by definition. Any sample that is not a
 core sample, and is at least ``eps`` in distance from any core sample, is
 considered an outlier by the algorithm.
 
+While the parameter ``min_samples`` primarily controls how tolerant the
+algorithm is towards noise (on noisy and large data sets it may be desirable
+to increase this parameter), the parameter ``eps`` is *crucial to choose
+appropriately* for the data set and distance function and usually cannot be
+left at the default value. It controls the local neighborhood of the points.
+When chosen too small, most data will not be clustered at all (and labeled
+as ``-1`` for "noise"). When chosen too large, it causes close clusters to
+be merged into one cluster, and eventually the entire data set to be returned
+as a single cluster. Some heuristics for choosing this parameter have been
+discussed in literature, for example based on a knee in the nearest neighbor
+distances plot (as discussed in the references below).
+
 In the figure below, the color indicates cluster membership, with large circles
 indicating core samples found by the algorithm. Smaller circles are non-core
 samples that are still part of a cluster. Moreover, the outliers are indicated
@@ -793,7 +805,7 @@ by black points below.
 
     This implementation is by default not memory efficient because it constructs
     a full pairwise similarity matrix in the case where kd-trees or ball-trees cannot
-    be used (e.g. with sparse matrices). This matrix will consume n^2 floats.
+    be used (e.g., with sparse matrices). This matrix will consume n^2 floats.
     A couple of mechanisms for getting around this are:
 
     - A sparse radius neighborhood graph (where missing entries are presumed to
@@ -814,6 +826,10 @@ by black points below.
    In Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996
 
+ * "DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
+   Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
+   In ACM Transactions on Database Systems (TODS), 42(3), 19.
+
 .. _birch:
 
 Birch
diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index f21beb3f91453..5b1d7c9c6c2ee 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -35,8 +35,11 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
         ``metric='precomputed'``.
 
     eps : float, optional
-        The maximum distance between two samples for them to be considered
-        as in the same neighborhood.
+        The maximum distance between two samples for one to be considered
+        as in the neighborhood of the other. This is not a maximum bound
+        on the distances of points within a cluster. This is the most
+        important DBSCAN parameter to choose appropriately for your data set
+        and distance function.
 
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
@@ -128,6 +131,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
     Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
     In: Proceedings of the 2nd International Conference on Knowledge Discovery
     and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
+
+    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
+    DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
+    ACM Transactions on Database Systems (TODS), 42(3), 19.
     """
     if not eps > 0.0:
         raise ValueError("eps must be positive.")
@@ -194,8 +201,11 @@ class DBSCAN(BaseEstimator, ClusterMixin):
     Parameters
     ----------
     eps : float, optional
-        The maximum distance between two samples for them to be considered
-        as in the same neighborhood.
+        The maximum distance between two samples for one to be considered
+        as in the neighborhood of the other. This is not a maximum bound
+        on the distances of points within a cluster. This is the most
+        important DBSCAN parameter to choose appropriately for your data set
+        and distance function.
 
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
@@ -299,6 +309,10 @@ class DBSCAN(BaseEstimator, ClusterMixin):
     Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
     In: Proceedings of the 2nd International Conference on Knowledge Discovery
     and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
+
+    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
+    DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
+    ACM Transactions on Database Systems (TODS), 42(3), 19.
     """
 
     def __init__(self, eps=0.5, min_samples=5, metric='euclidean',