DBSCAN++: Run DBSCAN on 100x larger datasets, up to 100x faster via subsampling #30523


Open · wants to merge 3 commits into base: main
103 changes: 93 additions & 10 deletions sklearn/cluster/_dbscan.py
@@ -6,14 +6,15 @@
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from numbers import Integral, Real
from numbers import Integral, Number, Real

import numpy as np
from scipy import sparse

from ..base import BaseEstimator, ClusterMixin, _fit_context
from ..metrics.pairwise import _VALID_METRICS
from ..neighbors import NearestNeighbors
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.validation import _check_sample_weight, validate_data
from ._dbscan_inner import dbscan_inner
@@ -23,6 +24,8 @@
{
"X": ["array-like", "sparse matrix"],
"sample_weight": ["array-like", None],
"subsample": [Interval(Real, 0, 1, closed="neither"), None],
"random_state": ["random_state"],
},
prefer_skip_nested_validation=False,
)
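
[Editorial aside, not part of the diff] The ``Interval`` constraint above
excludes both bounds. As an illustration, using scikit-learn's private
validation helpers (private API, subject to change):

    from numbers import Real
    from sklearn.utils._param_validation import Interval

    iv = Interval(Real, 0, 1, closed="neither")
    print(iv.is_satisfied_by(0.5))  # True: strictly inside (0, 1)
    print(iv.is_satisfied_by(0.0))  # False: left bound excluded
    print(iv.is_satisfied_by(1))    # False: right bound excluded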
@@ -38,6 +41,8 @@
p=2,
sample_weight=None,
n_jobs=None,
subsample=None,
random_state=None,
):
"""Perform DBSCAN clustering from vector array or distance matrix.

@@ -104,6 +109,20 @@
If precomputed distances are used, parallel execution is not available
and thus n_jobs will have no effect.

subsample : float, default=None
Sampling probability in the open interval (0, 1), representing the
proportion of the dataset that can be labeled a core sample. By
default, no subsampling is done. The lower the value, the less
memory and computation are used.
See: Jang, J. and Jiang, H. "DBSCAN++: Towards fast and scalable
density clustering". Proceedings of the 36th International Conference
on Machine Learning, 2019.

random_state : int, RandomState instance or None, default=None
Only relevant when ``subsample`` is set. Controls the randomness
of the subsampling. Pass an int for reproducible output across
multiple function calls. See :term:`Glossary <random_state>`.

Returns
-------
core_samples : ndarray of shape (n_core_samples,)
@@ -174,7 +193,9 @@
p=p,
n_jobs=n_jobs,
)
est.fit(X, sample_weight=sample_weight)
est.fit(
X, sample_weight=sample_weight, subsample=subsample, random_state=random_state
)
return est.core_sample_indices_, est.labels_
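
[Editorial note, not part of the diff] A minimal sketch of how the new
parameters would be called with this branch installed; the blob data and
the eps value are only assumptions for the example:

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.cluster import dbscan

    # Illustrative data; any (n_samples, n_features) array works.
    X, _ = make_blobs(n_samples=10_000, centers=5, random_state=0)

    # At most ~10% of the points are eligible to become core samples,
    # shrinking the radius-neighbors queries accordingly; random_state
    # makes the subsampling draw reproducible.
    core_idx, labels = dbscan(X, eps=1.0, min_samples=5,
                              subsample=0.1, random_state=0)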


@@ -295,6 +316,9 @@
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.

Yet another way is to use ``subsample`` to reduce the search space
for core samples (see the sketch after these notes).

:class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory
usage.
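
[Editorial aside, not part of the patch] The deduplicate-and-weight trick
mentioned in the notes above can be sketched as follows, assuming exact
duplicates:

    import numpy as np
    from sklearn.cluster import DBSCAN

    X = np.array([[0.0], [0.0], [1.0], [1.0], [1.0], [5.0]])
    # Collapse exact duplicates and carry multiplicities as weights, so
    # density counts are unchanged while the neighbor search shrinks.
    X_unique, counts = np.unique(X, axis=0, return_counts=True)
    db = DBSCAN(eps=0.5, min_samples=3).fit(X_unique, sample_weight=counts)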

@@ -363,7 +387,7 @@
# DBSCAN.metric is not validated yet
prefer_skip_nested_validation=False
)
def fit(self, X, y=None, sample_weight=None):
def fit(self, X, y=None, sample_weight=None, subsample=None, random_state=None):
"""Perform DBSCAN clustering from features, or distance matrix.

Parameters
@@ -383,6 +407,20 @@
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.

subsample : float, default=None
Sampling probability in the open interval (0, 1), representing the
proportion of the dataset that can be labeled a core sample. By
default, no subsampling is done. The lower the value, the less
memory and computation are used.
See: Jang, J. and Jiang, H. "DBSCAN++: Towards fast and scalable
density clustering". Proceedings of the 36th International Conference
on Machine Learning, 2019.

random_state : int, RandomState instance or None, default=None
Only relevant when ``subsample`` is set. Controls the randomness
of the subsampling. Pass an int for reproducible output across
multiple function calls. See :term:`Glossary <random_state>`.

Returns
-------
self : object
@@ -392,6 +430,9 @@

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
if subsample is not None:
if not isinstance(subsample, Number) or not 0 < subsample < 1:
raise ValueError("subsample needs to be a float strictly between 0 and 1.")

(Codecov annotation: added line sklearn/cluster/_dbscan.py#L435 was not covered by tests.)

# Calculate neighborhood for all samples. This leaves the original
# point in, which needs to be considered later (i.e. point i is in the
@@ -413,19 +454,40 @@
p=self.p,
n_jobs=self.n_jobs,
)

n = X.shape[0]
neighbors_model.fit(X)
# This has worst case O(n^2) memory complexity
neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)

if subsample:
rng = check_random_state(random_state)
mask = np.full(n, False)
mask[: int(n * subsample)] = True
rng.shuffle(mask)
neighborhoods = np.full(n, None)
neighborhoods[mask] = neighbors_model.radius_neighbors(
X[mask], return_distance=False
)
else:
# This has worst case O(n^2) memory complexity
neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)

if sample_weight is None:
n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
n_neighbors = np.array(
[
0 if neighbors is None else len(neighbors)
for neighbors in neighborhoods
]
)
else:
n_neighbors = np.array(
[np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
[
0 if neighbors is None else np.sum(sample_weight[neighbors])
for neighbors in neighborhoods
]
)

# Initially, all samples are noise.
labels = np.full(X.shape[0], -1, dtype=np.intp)
labels = np.full(n, -1, dtype=np.intp)

# A list of all core samples found.
core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
@@ -442,7 +504,9 @@
self.components_ = np.empty((0, X.shape[1]))
return self

def fit_predict(self, X, y=None, sample_weight=None):
def fit_predict(
self, X, y=None, sample_weight=None, subsample=None, random_state=None
):
"""Compute clusters from a data or distance matrix and predict labels.

Parameters
@@ -462,12 +526,31 @@
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.

subsample : float, default=None
Sampling probability in the open interval (0, 1), representing the
proportion of the dataset that can be labeled a core sample. By
default, no subsampling is done. The lower the value, the less
memory and computation are used.
See: Jang, J. and Jiang, H. "DBSCAN++: Towards fast and scalable
density clustering". Proceedings of the 36th International Conference
on Machine Learning, 2019.

random_state : int, RandomState instance or None, default=None
Only relevant when ``subsample`` is set. Controls the randomness
of the subsampling. Pass an int for reproducible output across
multiple function calls. See :term:`Glossary <random_state>`.

Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels. Noisy samples are given the label -1.
"""
self.fit(X, sample_weight=sample_weight)
self.fit(
X,
sample_weight=sample_weight,
subsample=subsample,
random_state=random_state,
)
return self.labels_

def __sklearn_tags__(self):
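
[Editorial sketch, not part of the diff] The subsampling inside ``fit``
reduces to a fixed-size boolean mask plus a shuffle; the standalone lines
below mirror that logic with illustrative names:

    import numpy as np

    rng = np.random.RandomState(0)
    n, subsample = 10, 0.3
    mask = np.full(n, False)
    mask[: int(n * subsample)] = True   # exactly floor(n * subsample) candidates
    rng.shuffle(mask)                   # scatter them uniformly at random
    print(mask.sum(), "of", n, "points may become core samples")
    # In fit(), radius_neighbors is queried only for X[mask]; every other
    # point keeps neighborhoods[i] = None, counts as 0 neighbors, and so
    # can never reach min_samples. It can still join a cluster as a
    # border point through a sampled core point's neighborhood.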
35 changes: 35 additions & 0 deletions sklearn/cluster/tests/test_dbscan.py
@@ -373,6 +373,41 @@ def test_weighted_dbscan(global_random_seed):
assert_array_equal(label1, est.labels_)


def test_subsampled_dbscan(global_random_seed):
# ensure subsample is validated
with pytest.raises(ValueError):
dbscan([[0], [1]], subsample=1.1)
with pytest.raises(ValueError):
dbscan([[0], [1]], subsample=0)
with pytest.raises(ValueError):
dbscan([[0], [1]], subsample=-0.1)
with pytest.raises(ValueError):
dbscan([[0], [1]], subsample="")

# ensure subsample has an effect
core1, label1 = dbscan(X, subsample=0.1, random_state=global_random_seed)
core2 = dbscan(X, subsample=None)[0]
assert len(core1) != len(core2)

# subsample should work with precomputed distance matrix
D = pairwise_distances(X)
core3 = dbscan(
D, subsample=0.1, random_state=global_random_seed, metric="precomputed"
)[0]
assert_array_equal(core1, core3)

# subsample should work with estimator
est = DBSCAN().fit(X, subsample=0.1, random_state=global_random_seed)
assert_array_equal(core1, est.core_sample_indices_)
assert_array_equal(label1, est.labels_)

est = DBSCAN()
label4 = est.fit_predict(X, subsample=0.1, random_state=global_random_seed)
assert_array_equal(core1, est.core_sample_indices_)
assert_array_equal(label1, label4)
assert_array_equal(label1, est.labels_)


@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"])
def test_dbscan_core_samples_toy(algorithm):
X = [[0], [2], [3], [4], [6], [8], [10]]
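
[Editorial aside] The title's "up to 100x faster" claim is workload
dependent; with this branch installed, a quick hedged probe looks like the
sketch below. Timings vary by machine and data:

    import time
    from sklearn.datasets import make_blobs
    from sklearn.cluster import DBSCAN

    X, _ = make_blobs(n_samples=50_000, centers=10, random_state=0)

    for sub in (None, 0.1, 0.01):
        tic = time.perf_counter()
        DBSCAN(eps=0.5, min_samples=5).fit(X, subsample=sub, random_state=0)
        print(f"subsample={sub}: {time.perf_counter() - tic:.2f}s")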