diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
index 7764bff94582f..a5075f8a99a31 100644
--- a/sklearn/cluster/_dbscan.py
+++ b/sklearn/cluster/_dbscan.py
@@ -6,7 +6,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import warnings
-from numbers import Integral, Real
+from numbers import Integral, Number, Real
 
 import numpy as np
 from scipy import sparse
@@ -14,6 +14,7 @@
 from ..base import BaseEstimator, ClusterMixin, _fit_context
 from ..metrics.pairwise import _VALID_METRICS
 from ..neighbors import NearestNeighbors
+from ..utils import check_random_state
 from ..utils._param_validation import Interval, StrOptions, validate_params
 from ..utils.validation import _check_sample_weight, validate_data
 from ._dbscan_inner import dbscan_inner
@@ -23,6 +24,8 @@
     {
         "X": ["array-like", "sparse matrix"],
         "sample_weight": ["array-like", None],
+        "subsample": [Interval(Real, 0, 1, closed="neither"), None],
+        "random_state": ["random_state"],
     },
     prefer_skip_nested_validation=False,
 )
@@ -38,6 +41,8 @@ def dbscan(
     p=2,
     sample_weight=None,
     n_jobs=None,
+    subsample=None,
+    random_state=None,
 ):
     """Perform DBSCAN clustering from vector array or distance matrix.
 
@@ -104,6 +109,20 @@ def dbscan(
     If precomputed distance are used, parallel execution is not available
     and thus n_jobs will have no effect.
 
+    subsample : float, default=None
+        Must be in the open interval (0, 1). By default, no subsampling is
+        performed. Sampling probability representing the proportion of the
+        dataset that may be labeled as core samples. The lower the value,
+        the less memory and computation are used.
+        See: Jang, J. and Jiang, H. "DBSCAN++: Towards fast and scalable
+        density clustering". Proceedings of the 36th International Conference
+        on Machine Learning, 2019.
+
+    random_state : int, RandomState instance or None, default=None
+        Only relevant when ``subsample`` is set. Controls the randomness
+        of the subsampling. Pass an int for reproducible output across
+        multiple function calls. See :term:`Glossary <random_state>`.
+
     Returns
     -------
     core_samples : ndarray of shape (n_core_samples,)
@@ -174,7 +193,9 @@ def dbscan(
         p=p,
         n_jobs=n_jobs,
     )
-    est.fit(X, sample_weight=sample_weight)
+    est.fit(
+        X, sample_weight=sample_weight, subsample=subsample, random_state=random_state
+    )
 
     return est.core_sample_indices_, est.labels_
 
@@ -295,6 +316,9 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     Another way to reduce memory and computation time is to remove
     (near-)duplicate points and use ``sample_weight`` instead.
 
+    Yet another way is to use ``subsample`` to reduce the search space for
+    core samples.
+
     :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
     memory usage.
 
@@ -363,7 +387,7 @@ def __init__(
         # DBSCAN.metric is not validated yet
         prefer_skip_nested_validation=False
     )
-    def fit(self, X, y=None, sample_weight=None):
+    def fit(self, X, y=None, sample_weight=None, subsample=None, random_state=None):
        """Perform DBSCAN clustering from features, or distance matrix.
 
         Parameters
@@ -383,6 +407,20 @@ def fit(self, X, y=None, sample_weight=None):
             negative weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.
 
+        subsample : float, default=None
+            Must be in the open interval (0, 1). By default, no subsampling
+            is performed. Sampling probability representing the proportion of
+            the dataset that may be labeled as core samples. The lower the
+            value, the less memory and computation are used.
+            See: Jang, J. and Jiang, H. "DBSCAN++: Towards fast and scalable
+            density clustering". Proceedings of the 36th International Conference
+            on Machine Learning, 2019.
+
+        random_state : int, RandomState instance or None, default=None
+            Only relevant when ``subsample`` is set. Controls the randomness
+            of the subsampling. Pass an int for reproducible output across
+            multiple function calls. See :term:`Glossary <random_state>`.
+
         Returns
         -------
         self : object
@@ -392,6 +430,9 @@ def fit(self, X, y=None, sample_weight=None):
 
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
+        if subsample is not None:
+            if not isinstance(subsample, Number) or not 0 < subsample < 1:
+                raise ValueError("subsample must be a float in the interval (0, 1).")
 
         # Calculate neighborhood for all samples. This leaves the original
         # point in, which needs to be considered later (i.e. point i is in the
@@ -413,19 +454,41 @@ def fit(self, X, y=None, sample_weight=None):
             p=self.p,
             n_jobs=self.n_jobs,
         )
+
+        n = X.shape[0]
         neighbors_model.fit(X)
-        # This has worst case O(n^2) memory complexity
-        neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)
+
+        if subsample is not None:
+            rng = check_random_state(random_state)
+            # Only a random fraction `subsample` of points may become core samples.
+            mask = np.full(n, False)
+            mask[: int(n * subsample)] = True
+            rng.shuffle(mask)
+            neighborhoods = np.full(n, None)
+            neighborhoods[mask] = neighbors_model.radius_neighbors(
+                X[mask], return_distance=False
+            )
+        else:
+            # This has worst case O(n^2) memory complexity
+            neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)
 
         if sample_weight is None:
-            n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
+            n_neighbors = np.array(
+                [
+                    0 if neighbors is None else len(neighbors)
+                    for neighbors in neighborhoods
+                ]
+            )
         else:
             n_neighbors = np.array(
-                [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
+                [
+                    0 if neighbors is None else np.sum(sample_weight[neighbors])
+                    for neighbors in neighborhoods
+                ]
             )
 
         # Initially, all samples are noise.
-        labels = np.full(X.shape[0], -1, dtype=np.intp)
+        labels = np.full(n, -1, dtype=np.intp)
 
         # A list of all core samples found.
         core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
@@ -442,7 +505,9 @@ def fit(self, X, y=None, sample_weight=None):
         self.components_ = np.empty((0, X.shape[1]))
         return self
 
-    def fit_predict(self, X, y=None, sample_weight=None):
+    def fit_predict(
+        self, X, y=None, sample_weight=None, subsample=None, random_state=None
+    ):
         """Compute clusters from a data or distance matrix and predict labels.
 
         Parameters
@@ -462,12 +527,31 @@ def fit_predict(self, X, y=None, sample_weight=None):
             negative weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.
 
+        subsample : float, default=None
+            Must be in the open interval (0, 1). By default, no subsampling
+            is performed. Sampling probability representing the proportion of
+            the dataset that may be labeled as core samples. The lower the
+            value, the less memory and computation are used.
+            See: Jang, J. and Jiang, H. "DBSCAN++: Towards fast and scalable
+            density clustering". Proceedings of the 36th International Conference
+            on Machine Learning, 2019.
+
+        random_state : int, RandomState instance or None, default=None
+            Only relevant when ``subsample`` is set. Controls the randomness
+            of the subsampling. Pass an int for reproducible output across
+            multiple function calls. See :term:`Glossary <random_state>`.
+
         Returns
         -------
         labels : ndarray of shape (n_samples,)
             Cluster labels. Noisy samples are given the label -1.
""" - self.fit(X, sample_weight=sample_weight) + self.fit( + X, + sample_weight=sample_weight, + subsample=subsample, + random_state=random_state, + ) return self.labels_ def __sklearn_tags__(self): diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 556f89312d2fc..6b01710eaca31 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -373,6 +373,41 @@ def test_weighted_dbscan(global_random_seed): assert_array_equal(label1, est.labels_) +def test_subsampled_dbscan(global_random_seed): + # ensure subsample is validated + with pytest.raises(ValueError): + dbscan([[0], [1]], subsample=1.1) + with pytest.raises(ValueError): + dbscan([[0], [1]], subsample=0) + with pytest.raises(ValueError): + dbscan([[0], [1]], subsample=-0.1) + with pytest.raises(ValueError): + dbscan([[0], [1]], subsample="") + + # ensure subsample has an effect + core1, label1 = dbscan(X, subsample=0.1, random_state=global_random_seed) + core2 = dbscan(X, subsample=None)[0] + assert len(core1) != len(core2) + + # subsample should work with precomputed distance matrix + D = pairwise_distances(X) + core3 = dbscan( + D, subsample=0.1, random_state=global_random_seed, metric="precomputed" + )[0] + assert_array_equal(core1, core3) + + # subsample should work with estimator + est = DBSCAN().fit(X, subsample=0.1, random_state=global_random_seed) + assert_array_equal(core1, est.core_sample_indices_) + assert_array_equal(label1, est.labels_) + + est = DBSCAN() + label4 = est.fit_predict(X, subsample=0.1, random_state=global_random_seed) + assert_array_equal(core1, est.core_sample_indices_) + assert_array_equal(label1, label4) + assert_array_equal(label1, est.labels_) + + @pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"]) def test_dbscan_core_samples_toy(algorithm): X = [[0], [2], [3], [4], [6], [8], [10]]