Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[MRG] FIX/TST boundary cases in dbscan #4073

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions sklearn/cluster/dbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',

min_samples : int, optional
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point.
to be considered as a core point. This number is inclusive of the
core point.

metric : string, or callable
The metric to use when calculating distance between instances in a
Expand Down Expand Up @@ -122,7 +123,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
labels = -np.ones(X.shape[0], dtype=np.int)

# A list of all core samples found.
core_samples = np.flatnonzero(n_neighbors > min_samples)
core_samples = np.flatnonzero(n_neighbors >= min_samples)
index_order = core_samples[random_state.permutation(core_samples.shape[0])]

# label_num is the label given to the new cluster
Expand Down Expand Up @@ -170,7 +171,8 @@ class DBSCAN(BaseEstimator, ClusterMixin):
as in the same neighborhood.
min_samples : int, optional
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point.
to be considered as a core point. This number is inclusive of the
core point.
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
Expand Down Expand Up @@ -234,8 +236,8 @@ def fit(self, X, y=None, sample_weight=None):
A feature array, or array of distances between samples if
``metric='precomputed'``.
sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with weight greater
than ``min_samples`` is automatically a core sample; a sample with
Weight of each sample, such that a sample with weight at least
``min_samples`` is automatically a core sample; a sample with
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
"""
Expand All @@ -260,8 +262,8 @@ def fit_predict(self, X, y=None, sample_weight=None):
A feature array, or array of distances between samples if
``metric='precomputed'``.
sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with weight greater
than ``min_samples`` is automatically a core sample; a sample with
Weight of each sample, such that a sample with weight at least
``min_samples`` is automatically a core sample; a sample with
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.

Expand Down
37 changes: 25 additions & 12 deletions sklearn/cluster/tests/test_dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in
from sklearn.cluster.dbscan_ import DBSCAN
from sklearn.cluster.dbscan_ import dbscan
from sklearn.cluster.tests.common import generate_clustered_data
Expand Down Expand Up @@ -79,11 +81,11 @@ def test_dbscan_sparse():

def test_dbscan_no_core_samples():
rng = np.random.RandomState(0)
X = rng.rand(40, 10)
X = rng.rand(15, 10)
X[X < .8] = 0

for X_ in [X, sparse.csr_matrix(X)]:
db = DBSCAN().fit(X_)
db = DBSCAN(min_samples=50).fit(X_)
assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
assert_array_equal(db.labels_, -1)
assert_equal(db.core_sample_indices_.shape, (0,))
Expand Down Expand Up @@ -185,33 +187,44 @@ def test_pickle():
assert_equal(type(pickle.loads(s)), obj.__class__)


def test_boundaries():
# ensure min_samples is inclusive of core point
core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
assert_in(0, core)
# ensure eps is inclusive of circumference
core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a stupid question but why do you need [1] twice?

assert_in(0, core)
core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
assert_not_in(0, core)


def test_weighted_dbscan():
# ensure sample_weight is validated
assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

# ensure sample_weight has an effect
assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
min_samples=5)[0])
min_samples=6)[0])
assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
min_samples=5)[0])
min_samples=6)[0])
assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
min_samples=5)[0])
min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
min_samples=5)[0])
min_samples=6)[0])

# points within eps of each other:
assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
sample_weight=[5, 1], min_samples=5)[0])
sample_weight=[5, 1], min_samples=6)[0])
# and effect of non-positive and non-integer sample_weight:
assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
eps=1.5, min_samples=5)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
eps=1.5, min_samples=5)[0])
eps=1.5, min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
eps=1.5, min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
eps=1.5, min_samples=5)[0])
eps=1.5, min_samples=6)[0])
assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
eps=1.5, min_samples=5)[0])
eps=1.5, min_samples=6)[0])

# for non-negative sample_weight, cores should be identical to repetition
rng = np.random.RandomState(42)
Expand Down