Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[MRG+1] DBSCAN: faster, weighted samples, and sparse input #3994

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 25, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ Enhancements

- Sparse support for :func:`paired_distances`. By `Joel Nothman`_.

- DBSCAN now supports sparse input and sample weights, and should be
faster in general. By `Joel Nothman`_.

Documentation improvements
..........................

Expand Down
168 changes: 87 additions & 81 deletions sklearn/cluster/dbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,73 @@
"""

# Author: Robert Layton <[email protected]>
# Joel Nothman <[email protected]>
#
# License: BSD 3 clause

import numpy as np

from ..base import BaseEstimator, ClusterMixin
from ..metrics import pairwise_distances
from ..utils import check_random_state
from ..utils import check_random_state, check_array, check_consistent_length
from ..neighbors import NearestNeighbors


def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
algorithm='auto', leaf_size=30, p=2, random_state=None):
algorithm='auto', leaf_size=30, p=2, sample_weight=None,
random_state=None):
"""Perform DBSCAN clustering from vector array or distance matrix.

Parameters
----------
X: array [n_samples, n_samples] or [n_samples, n_features]
Array of distances between samples, or a feature array.
The array is treated as a feature array unless the metric is given as
'precomputed'.
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.

eps: float, optional
eps : float, optional
The maximum distance between two samples for them to be considered
as in the same neighborhood.

min_samples: int, optional
The number of samples in a neighborhood for a point to be considered
as a core point.
min_samples : int, optional
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point.

metric: string, or callable
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.pairwise_distances for its
metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square.

algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.

leaf_size: int, optional (default = 30)
leaf_size : int, optional (default = 30)
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.

p: float, optional
p : float, optional
The power of the Minkowski metric to be used to calculate distance
between points.

random_state: numpy.RandomState, optional
sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with weight greater
than ``min_samples`` is automatically a core sample; a sample with
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.

random_state : numpy.RandomState, optional
The generator used to shuffle the order in which samples are
processed. Defaults to numpy.random.

Returns
-------
core_samples: array [n_core_samples]
core_samples : array [n_core_samples]
Indices of core samples.

labels : array [n_samples]
Expand All @@ -82,36 +90,40 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
if not eps > 0.0:
raise ValueError("eps must be positive.")

X = np.asarray(X)
n = X.shape[0]
X = check_array(X, accept_sparse='csr')
if sample_weight is not None:
sample_weight = np.asarray(sample_weight)
check_consistent_length(X, sample_weight)

# If index order not given, create random order.
random_state = check_random_state(random_state)
index_order = random_state.permutation(n)

# check for known metric powers
distance_matrix = True
# Calculate neighborhood for all samples. This leaves the original point
# in, which needs to be considered later (i.e. point i is in the
# neighborhood of point i. While true, it is useless information).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

point i is in the neighborhood of point i. While true, it is useless information).

if metric == 'precomputed':
D = pairwise_distances(X, metric=metric)
neighborhoods = [np.where(x <= eps)[0] for x in D]
else:
distance_matrix = False
neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
leaf_size=leaf_size,
metric=metric, p=p)
neighbors_model.fit(X)

# Calculate neighborhood for all samples. This leaves the original point
# in, which needs to be considered later (i.e. point i is the
# neighborhood of point i. While True, its useless information)
neighborhoods = []
if distance_matrix:
neighborhoods = [np.where(x <= eps)[0] for x in D]
neighborhoods = neighbors_model.radius_neighbors(X, eps,
return_distance=False)
neighborhoods = np.array(neighborhoods)
if sample_weight is None:
n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
else:
n_neighbors = np.array([np.sum(sample_weight[neighbors])
for neighbors in neighborhoods])

# Initially, all samples are noise.
labels = -np.ones(n, dtype=np.int)
labels = -np.ones(X.shape[0], dtype=np.int)

# A list of all core samples found.
core_samples = []
core_samples = np.flatnonzero(n_neighbors > min_samples)
index_order = core_samples[random_state.permutation(core_samples.shape[0])]

# label_num is the label given to the new cluster
label_num = 0
Expand All @@ -123,51 +135,19 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
if labels[index] != -1:
continue

# get neighbors from neighborhoods or ballTree
index_neighborhood = []
if distance_matrix:
index_neighborhood = neighborhoods[index]
else:
index_neighborhood = neighbors_model.radius_neighbors(
X[index], eps, return_distance=False)[0]

# Too few samples to be core
if len(index_neighborhood) < min_samples:
continue

core_samples.append(index)
labels[index] = label_num

# candidates for new core samples in the cluster.
candidates = [index]

while len(candidates) > 0:
new_candidates = []
cand_neighbors = np.concatenate(np.take(neighborhoods, candidates,
axis=0).tolist())
cand_neighbors = np.unique(cand_neighbors)
noise = cand_neighbors[labels.take(cand_neighbors) == -1]
labels[noise] = label_num
# A candidate is a core point in the current cluster that has
# not yet been used to expand the current cluster.
for c in candidates:
c_neighborhood = []
if distance_matrix:
c_neighborhood = neighborhoods[c]
else:
c_neighborhood = neighbors_model.radius_neighbors(
X[c], eps, return_distance=False)[0]
noise = np.where(labels[c_neighborhood] == -1)[0]
noise = c_neighborhood[noise]
labels[noise] = label_num
for neighbor in noise:
n_neighborhood = []
if distance_matrix:
n_neighborhood = neighborhoods[neighbor]
else:
n_neighborhood = neighbors_model.radius_neighbors(
X[neighbor], eps, return_distance=False)[0]
# check if its a core point as well
if len(n_neighborhood) >= min_samples:
# is new core point
new_candidates.append(neighbor)
core_samples.append(neighbor)
# Update candidates for next round of cluster expansion.
candidates = new_candidates
candidates = np.intersect1d(noise, core_samples)
# Current cluster finished.
# Next core point found will start a new cluster.
label_num += 1
Expand All @@ -187,8 +167,8 @@ class DBSCAN(BaseEstimator, ClusterMixin):
The maximum distance between two samples for them to be considered
as in the same neighborhood.
min_samples : int, optional
The number of samples in a neighborhood for a point to be considered
as a core point.
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point.
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
Expand Down Expand Up @@ -233,20 +213,46 @@ def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
self.p = p
self.random_state = random_state

def fit(self, X):
def fit(self, X, sample_weight=None):
"""Perform DBSCAN clustering from features or distance matrix.

Parameters
----------
X: array [n_samples, n_samples] or [n_samples, n_features]
Array of distances between samples, or a feature array.
The array is treated as a feature array unless the metric is
given as 'precomputed'.
params: dict
Overwrite keywords from __init__.
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
array of shape (n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.
sample_weight : array, shape (n_samples,), optional
Weight of each sample, such that a sample with weight greater
than ``min_samples`` is automatically a core sample; a sample with
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
"""
X = np.asarray(X)
clust = dbscan(X, **self.get_params())
X = check_array(X, accept_sparse='csr')
clust = dbscan(X, sample_weight=sample_weight, **self.get_params())
self.core_sample_indices_, self.labels_ = clust
self.components_ = X[self.core_sample_indices_].copy()
return self

def fit_predict(self, X, y=None, sample_weight=None):
    """Cluster X and return the label assigned to each sample.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.
    y : ignored
        Not used by this method.
    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with weight greater
        than ``min_samples`` is automatically a core sample; a sample with
        negative weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    Returns
    -------
    y : ndarray, shape (n_samples,)
        cluster labels
    """
    # Fitting stores the labels on the estimator; hand them back afterwards.
    self.fit(X, sample_weight=sample_weight)
    labels = self.labels_
    return labels
79 changes: 78 additions & 1 deletion sklearn/cluster/tests/test_dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
from numpy.testing import assert_raises

from scipy.spatial import distance
from scipy import sparse

from sklearn.utils.testing import assert_equal
from sklearn.cluster.dbscan_ import DBSCAN, dbscan
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.cluster.dbscan_ import DBSCAN
from sklearn.cluster.dbscan_ import dbscan
from .common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances

Expand Down Expand Up @@ -65,6 +69,15 @@ def test_dbscan_feature():
assert_equal(n_clusters_2, n_clusters)


def test_dbscan_sparse():
    """Check that sparse and dense inputs yield the same clustering."""
    params = dict(eps=.8, min_samples=10, random_state=0)

    # Same data and parameters: once as a scipy LIL matrix, once dense.
    cores_from_sparse, labels_from_sparse = dbscan(sparse.lil_matrix(X),
                                                   **params)
    cores_from_dense, labels_from_dense = dbscan(X, **params)

    assert_array_equal(cores_from_dense, cores_from_sparse)
    assert_array_equal(labels_from_dense, labels_from_sparse)


def test_dbscan_callable():
"""Tests the DBSCAN algorithm with a callable metric."""
# Parameters chosen specifically for this task.
Expand Down Expand Up @@ -159,3 +172,67 @@ def test_pickle():
obj = DBSCAN()
s = pickle.dumps(obj)
assert_equal(type(pickle.loads(s)), obj.__class__)


def test_weighted_dbscan():
    """Exercise the ``sample_weight`` argument of dbscan and DBSCAN."""

    def two_point_cores(**kwargs):
        # Core sample indices for the two 1-d points 0 and 1, min_samples=5.
        return dbscan([[0], [1]], min_samples=5, **kwargs)[0]

    # ensure sample_weight is validated: wrong lengths must raise
    for bad_weights in ([2], [2, 3, 4]):
        assert_raises(ValueError, dbscan, [[0], [1]],
                      sample_weight=bad_weights)

    # ensure sample_weight has an effect (default eps, so the points are
    # not neighbors of each other)
    isolated_cases = [
        ([], None),
        ([], [5, 5]),
        ([0], [6, 5]),
        ([0, 1], [6, 6]),
    ]
    for expected_cores, weights in isolated_cases:
        assert_array_equal(expected_cores,
                           two_point_cores(sample_weight=weights))

    # points within eps of each other, followed by the effect of
    # non-positive and non-integer sample_weight
    neighbor_cases = [
        ([0, 1], [5, 1]),
        ([], [5, 0]),
        ([0, 1], [5, 0.1]),
        ([0, 1], [6, 0]),
        ([], [6, -1]),
    ]
    for expected_cores, weights in neighbor_cases:
        assert_array_equal(expected_cores,
                           two_point_cores(eps=1.5, sample_weight=weights))

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    weights = rng.randint(0, 5, X.shape[0])
    cores_weighted, labels_weighted = dbscan(X, sample_weight=weights,
                                             random_state=42)
    assert_equal(len(labels_weighted), len(X))

    X_rep = np.repeat(X, weights, axis=0)
    cores_rep, _ = dbscan(X_rep, random_state=42)
    # Boolean core masks make the weighted and repeated runs comparable.
    rep_is_core = np.zeros(X_rep.shape[0], dtype=bool)
    rep_is_core[cores_rep] = True
    is_core = np.zeros(X.shape[0], dtype=bool)
    is_core[cores_weighted] = True
    assert_array_equal(np.repeat(is_core, weights), rep_is_core)

    # sample_weight should work with precomputed distance matrix
    distances = pairwise_distances(X)
    cores_pre, labels_pre = dbscan(distances, sample_weight=weights,
                                   metric='precomputed', random_state=42)
    assert_array_equal(cores_weighted, cores_pre)
    assert_array_equal(labels_weighted, labels_pre)

    # sample_weight should work with estimator
    fitted = DBSCAN(random_state=42).fit(X, sample_weight=weights)
    assert_array_equal(cores_weighted, fitted.core_sample_indices_)
    assert_array_equal(labels_weighted, fitted.labels_)

    # ... and through fit_predict, which must also leave labels_ populated
    estimator = DBSCAN(random_state=42)
    predicted = estimator.fit_predict(X, sample_weight=weights)
    assert_array_equal(cores_weighted, estimator.core_sample_indices_)
    assert_array_equal(labels_weighted, predicted)
    assert_array_equal(labels_weighted, estimator.labels_)