From 365dca2e452f3524c683e069b95b78a6ac3a2b29 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 6 Jul 2023 13:34:04 +0200
Subject: [PATCH 1/4] DOC Document worst-case memory complexity of DBSCAN

---
 sklearn/cluster/_dbscan.py | 82 +++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
index e3ba62dbfdf01..092214fc88a8c 100644
--- a/sklearn/cluster/_dbscan.py
+++ b/sklearn/cluster/_dbscan.py
@@ -168,33 +168,36 @@ def dbscan(
 class DBSCAN(ClusterMixin, BaseEstimator):
     """Perform DBSCAN clustering from vector array or distance matrix.
 
-    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
-    Finds core samples of high density and expands clusters from them.
-    Good for data which contains clusters of similar density.
+    DBSCAN - Density-Based Spatial Clustering of Applications with noise. Finds
+    core samples of high density and expands clusters from them. Good for data
+    which contains clusters of similar density.
+
+    The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can
+    occur when the `eps` param is large and `min_samples` is low.
 
     Read more in the :ref:`User Guide <dbscan>`.
 
     Parameters
     ----------
     eps : float, default=0.5
-        The maximum distance between two samples for one to be considered
-        as in the neighborhood of the other. This is not a maximum bound
-        on the distances of points within a cluster. This is the most
-        important DBSCAN parameter to choose appropriately for your data set
-        and distance function.
+        The maximum distance between two samples for one to be considered as in
+        the neighborhood of the other. This is not a maximum bound on the
+        distances of points within a cluster. This is the most important DBSCAN
+        parameter to choose appropriately for your data set and distance
+        function.
 
     min_samples : int, default=5
-        The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point. This includes the point itself.
+        The number of samples (or total weight) in a neighborhood for a point to
+        be considered as a core point. This includes the point itself.
 
     metric : str, or callable, default='euclidean'
         The metric to use when calculating distance between instances in a
-        feature array. If metric is a string or callable, it must be one of
-        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
-        its metric parameter.
-        If metric is "precomputed", X is assumed to be a distance matrix and
-        must be square. X may be a :term:`sparse graph`, in which
-        case only "nonzero" elements may be considered neighbors for DBSCAN.
+        feature array. If metric is a string or callable, it must be one of the
+        options allowed by :func:`sklearn.metrics.pairwise_distances` for its
+        metric parameter. If metric is "precomputed", X is assumed to be a
+        distance matrix and must be square. X may be a :term:`sparse graph`, in
+        which case only "nonzero" elements may be considered neighbors for
+        DBSCAN.
 
         .. versionadded:: 0.17
            metric *precomputed* to accept precomputed sparse matrix.
@@ -205,15 +208,14 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         .. versionadded:: 0.19
 
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
-        The algorithm to be used by the NearestNeighbors module
-        to compute pointwise distances and find nearest neighbors.
-        See NearestNeighbors module documentation for details.
+        The algorithm to be used by the NearestNeighbors module to compute
+        pointwise distances and find nearest neighbors. See NearestNeighbors
+        module documentation for details.
 
     leaf_size : int, default=30
-        Leaf size passed to BallTree or cKDTree. This can affect the speed
-        of the construction and query, as well as the memory required
-        to store the tree. The optimal value depends
-        on the nature of the problem.
+        Leaf size passed to BallTree or cKDTree. This can affect the speed of
+        the construction and query, as well as the memory required to store the
+        tree. The optimal value depends on the nature of the problem.
 
     p : float, default=None
         The power of the Minkowski metric to be used to calculate distance
@@ -221,10 +223,9 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         distance).
 
     n_jobs : int, default=None
-        The number of parallel jobs to run.
-        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
-        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
-        for more details.
+        The number of parallel jobs to run. ``None`` means 1 unless in a
+        :obj:`joblib.parallel_backend` context. ``-1`` means using all
+        processors. See :term:`Glossary <n_jobs>` for more details.
 
     Attributes
     ----------
@@ -235,8 +236,8 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         Copy of each core sample found by training.
 
     labels_ : ndarray of shape (n_samples)
-        Cluster labels for each point in the dataset given to fit().
-        Noisy samples are given the label -1.
+        Cluster labels for each point in the dataset given to fit(). Noisy
+        samples are given the label -1.
 
     n_features_in_ : int
         Number of features seen during :term:`fit`.
@@ -244,8 +245,8 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         .. versionadded:: 0.24
 
     feature_names_in_ : ndarray of shape (`n_features_in_`,)
-        Names of features seen during :term:`fit`. Defined only when `X`
-        has feature names that are all strings.
+        Names of features seen during :term:`fit`. Defined only when `X` has
+        feature names that are all strings.
 
         .. versionadded:: 1.0
 
@@ -262,12 +263,11 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     This implementation bulk-computes all neighborhood queries, which increases
     the memory complexity to O(n.d) where d is the average number of neighbors,
     while original DBSCAN had memory complexity O(n). It may attract a higher
-    memory complexity when querying these nearest neighborhoods, depending
-    on the ``algorithm``.
+    memory complexity when querying these nearest neighborhoods, depending on
+    the ``algorithm``.
 
-    One way to avoid the query complexity is to pre-compute sparse
-    neighborhoods in chunks using
-    :func:`NearestNeighbors.radius_neighbors_graph
+    One way to avoid the query complexity is to pre-compute sparse neighborhoods
+    in chunks using :func:`NearestNeighbors.radius_neighbors_graph
     <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
     ``mode='distance'``, then using ``metric='precomputed'`` here.
 
@@ -279,16 +279,16 @@ class DBSCAN(ClusterMixin, BaseEstimator):
 
     References
     ----------
-    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
-    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
+    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based Algorithm
+    for Discovering Clusters in Large Spatial Databases with Noise"
     <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
     In: Proceedings of the 2nd International Conference on Knowledge Discovery
     and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
 
     Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
-    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
-    <10.1145/3068335>`
-    ACM Transactions on Database Systems (TODS), 42(3), 19.
+    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use
+    DBSCAN." <10.1145/3068335>` ACM Transactions on Database Systems (TODS),
+    42(3), 19.
 
     Examples
     --------

From cbe4b38dd8c1176ea7f2b8b29dfa039748ed0cf2 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 6 Jul 2023 14:50:59 +0200
Subject: [PATCH 2/4] Restore original line wrapping of the DBSCAN docstring

---
 sklearn/cluster/_dbscan.py | 79 ++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
index 092214fc88a8c..a05a2c39ae266 100644
--- a/sklearn/cluster/_dbscan.py
+++ b/sklearn/cluster/_dbscan.py
@@ -168,9 +168,9 @@ def dbscan(
 class DBSCAN(ClusterMixin, BaseEstimator):
     """Perform DBSCAN clustering from vector array or distance matrix.
 
-    DBSCAN - Density-Based Spatial Clustering of Applications with noise. Finds
-    core samples of high density and expands clusters from them. Good for data
-    which contains clusters of similar density.
+    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
+    Finds core samples of high density and expands clusters from them.
+    Good for data which contains clusters of similar density.
 
     The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can
     occur when the `eps` param is large and `min_samples` is low.
@@ -180,24 +180,24 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     Parameters
     ----------
     eps : float, default=0.5
-        The maximum distance between two samples for one to be considered as in
-        the neighborhood of the other. This is not a maximum bound on the
-        distances of points within a cluster. This is the most important DBSCAN
-        parameter to choose appropriately for your data set and distance
-        function.
+        The maximum distance between two samples for one to be considered
+        as in the neighborhood of the other. This is not a maximum bound
+        on the distances of points within a cluster. This is the most
+        important DBSCAN parameter to choose appropriately for your data set
+        and distance function.
 
     min_samples : int, default=5
-        The number of samples (or total weight) in a neighborhood for a point to
-        be considered as a core point. This includes the point itself.
+        The number of samples (or total weight) in a neighborhood for a point
+        to be considered as a core point. This includes the point itself.
 
     metric : str, or callable, default='euclidean'
         The metric to use when calculating distance between instances in a
-        feature array. If metric is a string or callable, it must be one of the
-        options allowed by :func:`sklearn.metrics.pairwise_distances` for its
-        metric parameter. If metric is "precomputed", X is assumed to be a
-        distance matrix and must be square. X may be a :term:`sparse graph`, in
-        which case only "nonzero" elements may be considered neighbors for
-        DBSCAN.
+        feature array. If metric is a string or callable, it must be one of
+        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
+        its metric parameter.
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors for DBSCAN.
 
         .. versionadded:: 0.17
            metric *precomputed* to accept precomputed sparse matrix.
@@ -208,14 +208,15 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         .. versionadded:: 0.19
 
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
-        The algorithm to be used by the NearestNeighbors module to compute
-        pointwise distances and find nearest neighbors. See NearestNeighbors
-        module documentation for details.
+        The algorithm to be used by the NearestNeighbors module
+        to compute pointwise distances and find nearest neighbors.
+        See NearestNeighbors module documentation for details.
 
     leaf_size : int, default=30
-        Leaf size passed to BallTree or cKDTree. This can affect the speed of
-        the construction and query, as well as the memory required to store the
-        tree. The optimal value depends on the nature of the problem.
+        Leaf size passed to BallTree or cKDTree. This can affect the speed
+        of the construction and query, as well as the memory required
+        to store the tree. The optimal value depends
+        on the nature of the problem.
 
     p : float, default=None
         The power of the Minkowski metric to be used to calculate distance
@@ -223,9 +224,10 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         distance).
 
     n_jobs : int, default=None
-        The number of parallel jobs to run. ``None`` means 1 unless in a
-        :obj:`joblib.parallel_backend` context. ``-1`` means using all
-        processors. See :term:`Glossary <n_jobs>` for more details.
+        The number of parallel jobs to run.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
 
     Attributes
     ----------
@@ -236,8 +238,8 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         Copy of each core sample found by training.
 
     labels_ : ndarray of shape (n_samples)
-        Cluster labels for each point in the dataset given to fit(). Noisy
-        samples are given the label -1.
+        Cluster labels for each point in the dataset given to fit().
+        Noisy samples are given the label -1.
 
     n_features_in_ : int
         Number of features seen during :term:`fit`.
@@ -245,8 +247,8 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         .. versionadded:: 0.24
 
     feature_names_in_ : ndarray of shape (`n_features_in_`,)
-        Names of features seen during :term:`fit`. Defined only when `X` has
-        feature names that are all strings.
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
 
         .. versionadded:: 1.0
 
@@ -263,11 +265,12 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     This implementation bulk-computes all neighborhood queries, which increases
     the memory complexity to O(n.d) where d is the average number of neighbors,
     while original DBSCAN had memory complexity O(n). It may attract a higher
-    memory complexity when querying these nearest neighborhoods, depending on
-    the ``algorithm``.
+    memory complexity when querying these nearest neighborhoods, depending
+    on the ``algorithm``.
 
-    One way to avoid the query complexity is to pre-compute sparse neighborhoods
-    in chunks using :func:`NearestNeighbors.radius_neighbors_graph
+    One way to avoid the query complexity is to pre-compute sparse
+    neighborhoods in chunks using
+    :func:`NearestNeighbors.radius_neighbors_graph
     <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
     ``mode='distance'``, then using ``metric='precomputed'`` here.
 
@@ -279,16 +282,16 @@ class DBSCAN(ClusterMixin, BaseEstimator):
 
     References
     ----------
-    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based Algorithm
-    for Discovering Clusters in Large Spatial Databases with Noise"
+    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
+    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
     <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
     In: Proceedings of the 2nd International Conference on Knowledge Discovery
     and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
 
     Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
-    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use
-    DBSCAN." <10.1145/3068335>` ACM Transactions on Database Systems (TODS),
-    42(3), 19.
+    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
+    <10.1145/3068335>`
+    ACM Transactions on Database Systems (TODS), 42(3), 19.
 
     Examples
     --------

From 0b4004fa393e17c688f60a27475f11ab1cb588a4 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Fri, 7 Jul 2023 11:23:36 +0200
Subject: [PATCH 3/4] Clarify the description of the 'min_samples' parameter

---
 sklearn/cluster/_dbscan.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
index a05a2c39ae266..3db574f2719b5 100644
--- a/sklearn/cluster/_dbscan.py
+++ b/sklearn/cluster/_dbscan.py
@@ -187,8 +187,11 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         and distance function.
 
     min_samples : int, default=5
-        The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point. This includes the point itself.
+        The number of samples (or total weight) in a neighborhood for a point to
+        be considered as a core point. This includes the point itself. If
+        min_samples is set to a higher value, DBSCAN will find denser clusters,
+        whereas if it is set to a lower value, the found clusters will be more
+        sparse.
 
     metric : str, or callable, default='euclidean'
         The metric to use when calculating distance between instances in a

From d200e03b5ab21e147e140598d03feaa9e508d20a Mon Sep 17 00:00:00 2001
From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com>
Date: Thu, 13 Jul 2023 15:59:38 +0200
Subject: [PATCH 4/4] Format min_samples as inline code in the DBSCAN docstring

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
---
 sklearn/cluster/_dbscan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
index 3db574f2719b5..10053a31ec1d9 100644
--- a/sklearn/cluster/_dbscan.py
+++ b/sklearn/cluster/_dbscan.py
@@ -189,7 +189,7 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     min_samples : int, default=5
         The number of samples (or total weight) in a neighborhood for a point to
         be considered as a core point. This includes the point itself. If
-        min_samples is set to a higher value, DBSCAN will find denser clusters,
+        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
         whereas if it is set to a lower value, the found clusters will be more
         sparse.