From 365dca2e452f3524c683e069b95b78a6ac3a2b29 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 6 Jul 2023 13:34:04 +0200 Subject: [PATCH 1/4] Doc extended with info about space complexity --- sklearn/cluster/_dbscan.py | 82 +++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index e3ba62dbfdf01..092214fc88a8c 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -168,33 +168,36 @@ def dbscan( class DBSCAN(ClusterMixin, BaseEstimator): """Perform DBSCAN clustering from vector array or distance matrix. - DBSCAN - Density-Based Spatial Clustering of Applications with Noise. - Finds core samples of high density and expands clusters from them. - Good for data which contains clusters of similar density. + DBSCAN - Density-Based Spatial Clustering of Applications with noise. Finds + core samples of high density and expands clusters from them. Good for data + which contains clusters of similar density. + + The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can + occur when the `eps` param is large and `min_samples` is low. Read more in the :ref:`User Guide `. Parameters ---------- eps : float, default=0.5 - The maximum distance between two samples for one to be considered - as in the neighborhood of the other. This is not a maximum bound - on the distances of points within a cluster. This is the most - important DBSCAN parameter to choose appropriately for your data set - and distance function. + The maximum distance between two samples for one to be considered as in + the neighborhood of the other. This is not a maximum bound on the + distances of points within a cluster. This is the most important DBSCAN + parameter to choose appropriately for your data set and distance + function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. 
This includes the point itself. + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a - feature array. If metric is a string or callable, it must be one of - the options allowed by :func:`sklearn.metrics.pairwise_distances` for - its metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and - must be square. X may be a :term:`sparse graph`, in which - case only "nonzero" elements may be considered neighbors for DBSCAN. + feature array. If metric is a string or callable, it must be one of the + options allowed by :func:`sklearn.metrics.pairwise_distances` for its + metric parameter. If metric is "precomputed", X is assumed to be a + distance matrix and must be square. X may be a :term:`sparse graph`, in + which case only "nonzero" elements may be considered neighbors for + DBSCAN. .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. @@ -205,15 +208,14 @@ class DBSCAN(ClusterMixin, BaseEstimator): .. versionadded:: 0.19 algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' - The algorithm to be used by the NearestNeighbors module - to compute pointwise distances and find nearest neighbors. - See NearestNeighbors module documentation for details. + The algorithm to be used by the NearestNeighbors module to compute + pointwise distances and find nearest neighbors. See NearestNeighbors + module documentation for details. leaf_size : int, default=30 - Leaf size passed to BallTree or cKDTree. This can affect the speed - of the construction and query, as well as the memory required - to store the tree. The optimal value depends - on the nature of the problem. + Leaf size passed to BallTree or cKDTree. 
This can affect the speed of + the construction and query, as well as the memory required to store the + tree. The optimal value depends on the nature of the problem. p : float, default=None The power of the Minkowski metric to be used to calculate distance @@ -221,10 +223,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): distance). n_jobs : int, default=None - The number of parallel jobs to run. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary <n_jobs>` - for more details. + The number of parallel jobs to run. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary <n_jobs>` for more details. Attributes ---------- @@ -235,8 +236,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): Copy of each core sample found by training. labels_ : ndarray of shape (n_samples) - Cluster labels for each point in the dataset given to fit(). - Noisy samples are given the label -1. + Cluster labels for each point in the dataset given to fit(). Noisy + samples are given the label -1. n_features_in_ : int Number of features seen during :term:`fit`. @@ -244,8 +245,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) - Names of features seen during :term:`fit`. Defined only when `X` - has feature names that are all strings. + Names of features seen during :term:`fit`. Defined only when `X` has + feature names that are all strings. .. versionadded:: 1.0 @@ -262,12 +263,11 @@ class DBSCAN(ClusterMixin, BaseEstimator): This implementation bulk-computes all neighborhood queries, which increases the memory complexity to O(n.d) where d is the average number of neighbors, while original DBSCAN had memory complexity O(n). It may attract a higher - memory complexity when querying these nearest neighborhoods, depending - on the ``algorithm``. 
+ memory complexity when querying these nearest neighborhoods, depending on + the ``algorithm``. - One way to avoid the query complexity is to pre-compute sparse - neighborhoods in chunks using - :func:`NearestNeighbors.radius_neighbors_graph + One way to avoid the query complexity is to pre-compute sparse neighborhoods + in chunks using :func:`NearestNeighbors.radius_neighbors_graph ` with ``mode='distance'``, then using ``metric='precomputed'`` here. @@ -279,16 +279,16 @@ class DBSCAN(ClusterMixin, BaseEstimator): References ---------- - Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based - Algorithm for Discovering Clusters in Large Spatial Databases with Noise" + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based Algorithm + for Discovering Clusters in Large Spatial Databases with Noise" `_. In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). - :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." - <10.1145/3068335>` - ACM Transactions on Database Systems (TODS), 42(3), 19. + :doi:`"DBSCAN revisited, revisited: why and how you should (still) use + DBSCAN." <10.1145/3068335>` ACM Transactions on Database Systems (TODS), + 42(3), 19. Examples -------- From cbe4b38dd8c1176ea7f2b8b29dfa039748ed0cf2 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 6 Jul 2023 14:50:59 +0200 Subject: [PATCH 2/4] formatting --- sklearn/cluster/_dbscan.py | 79 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 092214fc88a8c..a05a2c39ae266 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -168,9 +168,9 @@ def dbscan( class DBSCAN(ClusterMixin, BaseEstimator): """Perform DBSCAN clustering from vector array or distance matrix. 
- DBSCAN - Density-Based Spatial Clustering of Applications with noise. Finds - core samples of high density and expands clusters from them. Good for data - which contains clusters of similar density. + DBSCAN - Density-Based Spatial Clustering of Applications with Noise. + Finds core samples of high density and expands clusters from them. + Good for data which contains clusters of similar density. The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can occur when the `eps` param is large and `min_samples` is low. @@ -180,24 +180,24 @@ class DBSCAN(ClusterMixin, BaseEstimator): Parameters ---------- eps : float, default=0.5 - The maximum distance between two samples for one to be considered as in - the neighborhood of the other. This is not a maximum bound on the - distances of points within a cluster. This is the most important DBSCAN - parameter to choose appropriately for your data set and distance - function. + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point to - be considered as a core point. This includes the point itself. + The number of samples (or total weight) in a neighborhood for a point + to be considered as a core point. This includes the point itself. metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a - feature array. If metric is a string or callable, it must be one of the - options allowed by :func:`sklearn.metrics.pairwise_distances` for its - metric parameter. If metric is "precomputed", X is assumed to be a - distance matrix and must be square. 
X may be a :term:`sparse graph`, in - which case only "nonzero" elements may be considered neighbors for - DBSCAN. + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors for DBSCAN. .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. @@ -208,14 +208,15 @@ class DBSCAN(ClusterMixin, BaseEstimator): .. versionadded:: 0.19 algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' - The algorithm to be used by the NearestNeighbors module to compute - pointwise distances and find nearest neighbors. See NearestNeighbors - module documentation for details. + The algorithm to be used by the NearestNeighbors module + to compute pointwise distances and find nearest neighbors. + See NearestNeighbors module documentation for details. leaf_size : int, default=30 - Leaf size passed to BallTree or cKDTree. This can affect the speed of - the construction and query, as well as the memory required to store the - tree. The optimal value depends on the nature of the problem. + Leaf size passed to BallTree or cKDTree. This can affect the speed + of the construction and query, as well as the memory required + to store the tree. The optimal value depends + on the nature of the problem. p : float, default=None The power of the Minkowski metric to be used to calculate distance @@ -223,9 +224,10 @@ class DBSCAN(ClusterMixin, BaseEstimator): distance). n_jobs : int, default=None - The number of parallel jobs to run. ``None`` means 1 unless in a - :obj:`joblib.parallel_backend` context. ``-1`` means using all - processors. See :term:`Glossary ` for more details. + The number of parallel jobs to run. 
+ ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary <n_jobs>` + for more details. Attributes ---------- @@ -236,8 +238,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): Copy of each core sample found by training. labels_ : ndarray of shape (n_samples) - Cluster labels for each point in the dataset given to fit(). Noisy - samples are given the label -1. + Cluster labels for each point in the dataset given to fit(). + Noisy samples are given the label -1. n_features_in_ : int Number of features seen during :term:`fit`. @@ -245,8 +247,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) - Names of features seen during :term:`fit`. Defined only when `X` has - feature names that are all strings. + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. .. versionadded:: 1.0 @@ -263,11 +265,12 @@ class DBSCAN(ClusterMixin, BaseEstimator): This implementation bulk-computes all neighborhood queries, which increases the memory complexity to O(n.d) where d is the average number of neighbors, while original DBSCAN had memory complexity O(n). It may attract a higher - memory complexity when querying these nearest neighborhoods, depending on - the ``algorithm``. + memory complexity when querying these nearest neighborhoods, depending + on the ``algorithm``. - One way to avoid the query complexity is to pre-compute sparse neighborhoods - in chunks using :func:`NearestNeighbors.radius_neighbors_graph + One way to avoid the query complexity is to pre-compute sparse + neighborhoods in chunks using + :func:`NearestNeighbors.radius_neighbors_graph <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with ``mode='distance'``, then using ``metric='precomputed'`` here. @@ -279,16 +282,16 @@ class DBSCAN(ClusterMixin, BaseEstimator): References ---------- - Ester, M., H. P. Kriegel, J. Sander, and X. 
Xu, `"A Density-Based Algorithm - for Discovering Clusters in Large Spatial Databases with Noise" + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based + Algorithm for Discovering Clusters in Large Spatial Databases with Noise" `_. In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). - :doi:`"DBSCAN revisited, revisited: why and how you should (still) use - DBSCAN." <10.1145/3068335>` ACM Transactions on Database Systems (TODS), - 42(3), 19. + :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." + <10.1145/3068335>` + ACM Transactions on Database Systems (TODS), 42(3), 19. Examples -------- From 0b4004fa393e17c688f60a27475f11ab1cb588a4 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 7 Jul 2023 11:23:36 +0200 Subject: [PATCH 3/4] added clarification for param description 'min_samples' --- sklearn/cluster/_dbscan.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index a05a2c39ae266..3db574f2719b5 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -187,8 +187,11 @@ class DBSCAN(ClusterMixin, BaseEstimator): and distance function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. This includes the point itself. + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. If + min_samples is set to a higher value, DBSCAN will find denser clusters, + whereas if it is set to a lower value, the found clusters will be more + sparse. 
metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a From d200e03b5ab21e147e140598d03feaa9e508d20a Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Thu, 13 Jul 2023 15:59:38 +0200 Subject: [PATCH 4/4] Update sklearn/cluster/_dbscan.py Co-authored-by: Guillaume Lemaitre --- sklearn/cluster/_dbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 3db574f2719b5..10053a31ec1d9 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -189,7 +189,7 @@ class DBSCAN(ClusterMixin, BaseEstimator): min_samples : int, default=5 The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If - min_samples is set to a higher value, DBSCAN will find denser clusters, + `min_samples` is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.