[MRG + 1] Do not shuffle by default for DBSCAN. #4066

Closed. Wants to merge 2 commits.

sklearn/cluster/dbscan_.py: 55 changes (34 additions, 21 deletions)

@@ -8,11 +8,13 @@
 #
 # License: BSD 3 clause

+import warnings
+
 import numpy as np

 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..utils import check_random_state, check_array, check_consistent_length
+from ..utils import check_array, check_consistent_length
 from ..neighbors import NearestNeighbors


@@ -34,7 +36,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',

     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point itself.

     metric : string, or callable
         The metric to use when calculating distance between instances in a
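
For illustration, the new inclusive counting in a standalone snippet (this mirrors the test_boundaries test added in this PR; the data is illustrative):

    import numpy as np
    from sklearn.cluster import dbscan

    # Two points at distance 1 with eps=2: each eps-neighborhood contains
    # both points, so min_samples=2 makes both core under the new rule
    # that a point counts toward its own neighborhood.
    core, labels = dbscan(np.array([[0.], [1.]]), eps=2, min_samples=2)
    print(core)    # expected: [0 1]
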
@@ -60,13 +62,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
         between points.

     sample_weight : array, shape (n_samples,), optional
-        Weight of each sample, such that a sample with weight greater
-        than ``min_samples`` is automatically a core sample; a sample with
-        negative weight may inhibit its eps-neighbor from being core.
+        Weight of each sample, such that a sample with a weight of at least
+        ``min_samples`` is by itself a core sample; a sample with negative
+        weight may inhibit its eps-neighbor from being core.
         Note that weights are absolute, and default to 1.

     random_state: numpy.RandomState, optional
-        The generator used to shuffle the samples. Defaults to numpy.random.
+        Ignored, will be removed in the next version.
+        (DBSCAN does not use random initialization).

     Returns
     -------
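
The documented sample_weight semantics as a standalone snippet (a sketch with illustrative values, not part of the diff):

    import numpy as np
    from sklearn.cluster import dbscan

    # An isolated point whose weight reaches min_samples is core by itself;
    # its far-away, low-weight companion stays noise.
    core, labels = dbscan([[0], [10]], eps=1.5, sample_weight=[6, 1],
                          min_samples=6)
    print(core, labels)    # expected: [0] [ 0 -1]
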
@@ -80,6 +83,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     -----
     See examples/cluster/plot_dbscan.py for an example.

+    This implementation bulk-computes all neighborhood queries, which increases
+    the memory complexity to O(n.d) where d is the average number of neighbors,
+    while original DBSCAN had memory complexity O(n).
+
     References
     ----------
     Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
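
The memory note above in concrete terms; a sketch (sizes are illustrative):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.random.RandomState(0).rand(1000, 2)
    nn = NearestNeighbors(radius=0.1).fit(X)
    # One bulk call returns an array of neighbor-index arrays, so all n
    # neighborhoods live in memory at once: roughly n * d indices, where
    # d is the average neighborhood size.
    neighborhoods = nn.radius_neighbors(X, return_distance=False)
    print(sum(len(hood) for hood in neighborhoods))
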
@@ -89,15 +96,16 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     """
     if not eps > 0.0:
         raise ValueError("eps must be positive.")
+    if random_state is not None:

Review thread on the line above:
  Member: This should be a deprecation warning and should say that it will be removed in 0.18, I think.
  Member: This comment needs to be addressed before merging.
  Member: Indeed this has not been addressed yet.

warnings.warn("The parameter random_state is ignored " +
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: there is no need for the + sign here.

"and will be removed in the next version. " +
"(DBSCAN is deterministic except for rare border cases)")

     X = check_array(X, accept_sparse='csr')
     if sample_weight is not None:
         sample_weight = np.asarray(sample_weight)
         check_consistent_length(X, sample_weight)

-    # If index order not given, create random order.
-    random_state = check_random_state(random_state)
-
     # Calculate neighborhood for all samples. This leaves the original point
     # in, which needs to be considered later (i.e. point i is in the
     # neighborhood of point i. While True, its useless information)
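
The review thread above asks for a real DeprecationWarning that names the removal version and drops the needless string concatenation; a sketch of that suggestion (the helper name is hypothetical, not PR code):

    import warnings

    def _warn_deprecated_random_state(random_state):
        # Hypothetical helper reflecting the reviewers' request.
        if random_state is not None:
            warnings.warn("The parameter random_state is deprecated and "
                          "will be removed in 0.18 (DBSCAN is deterministic "
                          "except for rare border cases).", DeprecationWarning)
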
@@ -109,6 +117,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
                                            leaf_size=leaf_size,
                                            metric=metric, p=p)
         neighbors_model.fit(X)
+        # This has worst case O(n^2) memory complexity
         neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                          return_distance=False)
         neighborhoods = np.array(neighborhoods)
@@ -122,15 +131,14 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     labels = -np.ones(X.shape[0], dtype=np.int)

     # A list of all core samples found.
-    core_samples = np.flatnonzero(n_neighbors > min_samples)
-    index_order = core_samples[random_state.permutation(core_samples.shape[0])]
+    core_samples = np.flatnonzero(n_neighbors >= min_samples)

     # label_num is the label given to the new cluster
     label_num = 0

     # Look at all samples and determine if they are core.
     # If they are then build a new cluster from them.
-    for index in index_order:
+    for index in core_samples:
         # Already classified
         if labels[index] != -1:
             continue
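
With the shuffle gone, expansion follows plain index order, and the only order-sensitive outcome left is which cluster claims a shared border point; a sketch (hypothetical data, output reasoned from the patched loop):

    import numpy as np
    from sklearn.cluster import dbscan

    # Points at 0, 0, 1, 2, 3, 4, 4 with eps=1 and min_samples=4:
    # indices 2 and 4 are the only cores (4 neighbors each, themselves
    # included); index 3 is a border point within eps of both cores.
    X = np.array([[0.], [0.], [1.], [2.], [3.], [4.], [4.]])
    core, labels = dbscan(X, eps=1, min_samples=4)
    print(core)      # expected: [2 4]
    print(labels)    # expected: [0 0 0 0 1 1 1]; index 3 joins whichever
                     # cluster expands first, here the one seeded by core 2
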
@@ -170,16 +178,14 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         as in the same neighborhood.
     min_samples : int, optional
         The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point.
+        to be considered as a core point. This includes the point itself.
     metric : string, or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
         the options allowed by metrics.pairwise.calculate_distance for its
         metric parameter.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square.
-    random_state : numpy.RandomState, optional
-        The generator used to shuffle the samples. Defaults to numpy.random.
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
         to compute pointwise distances and find nearest neighbors.
@@ -189,6 +195,9 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         of the construction and query, as well as the memory required
         to store the tree. The optimal value depends
         on the nature of the problem.
+    random_state: numpy.RandomState, optional
+        Ignored, will be removed in the next version.
+        (DBSCAN does not use random initialization).

     Attributes
     ----------
@@ -206,6 +215,10 @@ class DBSCAN(BaseEstimator, ClusterMixin):
     -----
     See examples/cluster/plot_dbscan.py for an example.

+    This implementation bulk-computes all neighborhood queries, which increases
+    the memory complexity to O(n.d) where d is the average number of neighbors,
+    while original DBSCAN had memory complexity O(n).
+
     References
     ----------
     Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
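
Typical estimator usage after this change, with random_state simply omitted; a sketch assuming the patched, inclusive min_samples counting (data is illustrative):

    import numpy as np
    from sklearn.cluster import DBSCAN

    # Two 1D runs of points spaced 0.1 apart, far from each other.
    X = np.vstack([np.arange(10).reshape(-1, 1) * 0.1,
                   np.arange(10).reshape(-1, 1) * 0.1 + 5])
    db = DBSCAN(eps=0.15, min_samples=3).fit(X)
    print(db.labels_)    # expected: ten 0s then ten 1s; the run endpoints
                         # are border points, not noise
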
@@ -234,9 +247,9 @@ def fit(self, X, y=None, sample_weight=None):
             A feature array, or array of distances between samples if
             ``metric='precomputed'``.
         sample_weight : array, shape (n_samples,), optional
-            Weight of each sample, such that a sample with weight greater
-            than ``min_samples`` is automatically a core sample; a sample with
-            negative weight may inhibit its eps-neighbor from being core.
+            Weight of each sample, such that a sample with a weight of at least
+            ``min_samples`` is by itself a core sample; a sample with negative
+            weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.
         """
         X = check_array(X, accept_sparse='csr')
@@ -260,9 +273,9 @@ def fit_predict(self, X, y=None, sample_weight=None):
             A feature array, or array of distances between samples if
             ``metric='precomputed'``.
         sample_weight : array, shape (n_samples,), optional
-            Weight of each sample, such that a sample with weight greater
-            than ``min_samples`` is automatically a core sample; a sample with
-            negative weight may inhibit its eps-neighbor from being core.
+            Weight of each sample, such that a sample with a weight of at least
+            ``min_samples`` is by itself a core sample; a sample with negative
+            weight may inhibit its eps-neighbor from being core.
             Note that weights are absolute, and default to 1.

         Returns

sklearn/cluster/tests/test_dbscan.py: 35 changes (24 additions, 11 deletions)

@@ -12,6 +12,8 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_in
+from sklearn.utils.testing import assert_not_in
 from sklearn.cluster.dbscan_ import DBSCAN
 from sklearn.cluster.dbscan_ import dbscan
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -83,7 +85,7 @@ def test_dbscan_no_core_samples():
     X[X < .8] = 0

     for X_ in [X, sparse.csr_matrix(X)]:
-        db = DBSCAN().fit(X_)
+        db = DBSCAN(min_samples=6).fit(X_)
         assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
         assert_array_equal(db.labels_, -1)
         assert_equal(db.core_sample_indices_.shape, (0,))
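
(The bump from the default min_samples=5 to min_samples=6 preserves the old effective threshold: core status is now decided by n_neighbors >= min_samples, counting the point itself, where it was previously n_neighbors > min_samples.)
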
@@ -185,33 +187,44 @@ def test_pickle():
     assert_equal(type(pickle.loads(s)), obj.__class__)


+def test_boundaries():
+    # ensure min_samples is inclusive of core point
+    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
+    assert_in(0, core)
+    # ensure eps is inclusive of circumference
+    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
+    assert_in(0, core)
+    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
+    assert_not_in(0, core)
+
+
 def test_weighted_dbscan():
     # ensure sample_weight is validated
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
     assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

     # ensure sample_weight has an effect
     assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
-                                  min_samples=5)[0])
+                                  min_samples=6)[0])
     assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
-                                   min_samples=5)[0])
+                                   min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
-                                      min_samples=5)[0])
+                                      min_samples=6)[0])

     # points within eps of each other:
     assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
-                                      sample_weight=[5, 1], min_samples=5)[0])
+                                      sample_weight=[5, 1], min_samples=6)[0])
     # and effect of non-positive and non-integer sample_weight:
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
-                                  eps=1.5, min_samples=5)[0])
-    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
-                                      eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
-                                      eps=1.5, min_samples=5)[0])
+                                      eps=1.5, min_samples=6)[0])
     assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
-                                  eps=1.5, min_samples=5)[0])
+                                  eps=1.5, min_samples=6)[0])

     # for non-negative sample_weight, cores should be identical to repetition
     rng = np.random.RandomState(42)
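
The property the truncated test above asserts, in a compact standalone sketch (illustrative data, not the PR's test; assumes integer weights behave exactly like repeated points):

    import numpy as np
    from sklearn.cluster import dbscan

    rng = np.random.RandomState(42)
    X = rng.rand(10, 2)
    weights = rng.randint(1, 4, size=10)    # integer weights 1..3

    # Weighted run vs. literally repeating each point `weight` times.
    core_weighted, _ = dbscan(X, eps=0.3, min_samples=3,
                              sample_weight=weights)
    X_repeated = np.repeat(X, weights, axis=0)
    core_repeated, _ = dbscan(X_repeated, eps=0.3, min_samples=3)

    # Map repeated-row indices back to original rows before comparing.
    original_index = np.repeat(np.arange(10), weights)
    assert set(original_index[core_repeated]) == set(core_weighted)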