scikit-learn · agramfort · Dec 25, 2014 · Dec 23, 2014 · Dec 23, 2014 · Dec 24, 2014
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -150,6 +150,9 @@ Enhancements
 
    - Sparse support for :func:`paired_distances`. By `Joel Nothman`_.
 
+   - DBSCAN now supports sparse input and sample weights, and should be
+     faster in general. By `Joel Nothman`_.
+
 Documentation improvements
 ..........................
 

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
@@ -4,65 +4,73 @@
 """
 
 # Author: Robert Layton <[email protected]>
+#         Joel Nothman <[email protected]>
 #
 # License: BSD 3 clause
 
 import numpy as np
 
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..utils import check_random_state
+from ..utils import check_random_state, check_array, check_consistent_length
 from ..neighbors import NearestNeighbors
 
 
 def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
-           algorithm='auto', leaf_size=30, p=2, random_state=None):
+           algorithm='auto', leaf_size=30, p=2, sample_weight=None,
+           random_state=None):
     """Perform DBSCAN clustering from vector array or distance matrix.
 
     Parameters
     ----------
-    X: array [n_samples, n_samples] or [n_samples, n_features]
-        Array of distances between samples, or a feature array.
-        The array is treated as a feature array unless the metric is given as
-        'precomputed'.
+    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
+            array of shape (n_samples, n_samples)
+        A feature array, or array of distances between samples if
+        ``metric='precomputed'``.
 
-    eps: float, optional
+    eps : float, optional
         The maximum distance between two samples for them to be considered
         as in the same neighborhood.
 
-    min_samples: int, optional
-        The number of samples in a neighborhood for a point to be considered
-        as a core point.
+    min_samples : int, optional
+        The number of samples (or total weight) in a neighborhood for a point
+        to be considered as a core point.
 
-    metric: string, or callable
+    metric : string, or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
         the options allowed by metrics.pairwise.pairwise_distances for its
         metric parameter.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square.
 
-    algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
         to compute pointwise distances and find nearest neighbors.
         See NearestNeighbors module documentation for details.
 
-    leaf_size: int, optional (default = 30)
+    leaf_size : int, optional (default = 30)
         Leaf size passed to BallTree or cKDTree. This can affect the speed
         of the construction and query, as well as the memory required
         to store the tree. The optimal value depends
         on the nature of the problem.
 
-    p: float, optional
+    p : float, optional
         The power of the Minkowski metric to be used to calculate distance
         between points.
 
-    random_state: numpy.RandomState, optional
+    sample_weight : array, shape (n_samples,), optional
+        Weight of each sample, such that a sample with weight greater
+        than ``min_samples`` is automatically a core sample; a sample with
+        negative weight may inhibit its eps-neighbor from being core.
+        Note that weights are absolute, and default to 1.
+
+    random_state : numpy.RandomState, optional
         The generator used to initialize the centers. Defaults to numpy.random.
 
     Returns
     -------
-    core_samples: array [n_core_samples]
+    core_samples : array [n_core_samples]
         Indices of core samples.
 
     labels : array [n_samples]
@@ -82,36 +90,40 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     if not eps > 0.0:
         raise ValueError("eps must be positive.")
 
-    X = np.asarray(X)
-    n = X.shape[0]
+    X = check_array(X, accept_sparse='csr')
+    if sample_weight is not None:
+        sample_weight = np.asarray(sample_weight)
+        check_consistent_length(X, sample_weight)
 
     # If index order not given, create random order.
     random_state = check_random_state(random_state)
-    index_order = random_state.permutation(n)
 
-    # check for known metric powers
-    distance_matrix = True
+    # Calculate neighborhood for all samples. This leaves the original point
+    # in, which needs to be considered later (i.e. point i is in the
+    # neighborhood of point i; while true, this is useless information).
     if metric == 'precomputed':
         D = pairwise_distances(X, metric=metric)
+        neighborhoods = [np.where(x <= eps)[0] for x in D]
     else:
-        distance_matrix = False
         neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                            leaf_size=leaf_size,
                                            metric=metric, p=p)
         neighbors_model.fit(X)
-
-    # Calculate neighborhood for all samples. This leaves the original point
-    # in, which needs to be considered later (i.e. point i is the
-    # neighborhood of point i. While True, its useless information)
-    neighborhoods = []
-    if distance_matrix:
-        neighborhoods = [np.where(x <= eps)[0] for x in D]
+        neighborhoods = neighbors_model.radius_neighbors(X, eps,
+                                                         return_distance=False)
+        neighborhoods = np.array(neighborhoods)
+    if sample_weight is None:
+        n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
+    else:
+        n_neighbors = np.array([np.sum(sample_weight[neighbors])
+                                for neighbors in neighborhoods])
 
     # Initially, all samples are noise.
-    labels = -np.ones(n, dtype=np.int)
+    labels = -np.ones(X.shape[0], dtype=np.int)
 
     # A list of all core samples found.
-    core_samples = []
+    core_samples = np.flatnonzero(n_neighbors > min_samples)
+    index_order = core_samples[random_state.permutation(core_samples.shape[0])]
 
     # label_num is the label given to the new cluster
     label_num = 0
@@ -123,51 +135,19 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
         if labels[index] != -1:
             continue
 
-        # get neighbors from neighborhoods or ballTree
-        index_neighborhood = []
-        if distance_matrix:
-            index_neighborhood = neighborhoods[index]
-        else:
-            index_neighborhood = neighbors_model.radius_neighbors(
-                X[index], eps, return_distance=False)[0]
-
-        # Too few samples to be core
-        if len(index_neighborhood) < min_samples:
-            continue
-
-        core_samples.append(index)
         labels[index] = label_num
+
         # candidates for new core samples in the cluster.
         candidates = [index]
-
         while len(candidates) > 0:
-            new_candidates = []
+            cand_neighbors = np.concatenate(np.take(neighborhoods, candidates,
+                                                    axis=0).tolist())
+            cand_neighbors = np.unique(cand_neighbors)
+            noise = cand_neighbors[labels.take(cand_neighbors) == -1]
+            labels[noise] = label_num
             # A candidate is a core point in the current cluster that has
             # not yet been used to expand the current cluster.
-            for c in candidates:
-                c_neighborhood = []
-                if distance_matrix:
-                    c_neighborhood = neighborhoods[c]
-                else:
-                    c_neighborhood = neighbors_model.radius_neighbors(
-                        X[c], eps, return_distance=False)[0]
-                noise = np.where(labels[c_neighborhood] == -1)[0]
-                noise = c_neighborhood[noise]
-                labels[noise] = label_num
-                for neighbor in noise:
-                    n_neighborhood = []
-                    if distance_matrix:
-                        n_neighborhood = neighborhoods[neighbor]
-                    else:
-                        n_neighborhood = neighbors_model.radius_neighbors(
-                            X[neighbor], eps, return_distance=False)[0]
-                    # check if its a core point as well
-                    if len(n_neighborhood) >= min_samples:
-                        # is new core point
-                        new_candidates.append(neighbor)
-                        core_samples.append(neighbor)
-            # Update candidates for next round of cluster expansion.
-            candidates = new_candidates
+            candidates = np.intersect1d(noise, core_samples)
         # Current cluster finished.
         # Next core point found will start a new cluster.
         label_num += 1
@@ -187,8 +167,8 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         The maximum distance between two samples for them to be considered
         as in the same neighborhood.
     min_samples : int, optional
-        The number of samples in a neighborhood for a point to be considered
-        as a core point.
+        The number of samples (or total weight) in a neighborhood for a point
+        to be considered as a core point.
     metric : string, or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
@@ -233,20 +213,46 @@ def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
         self.p = p
         self.random_state = random_state
 
-    def fit(self, X):
+    def fit(self, X, sample_weight=None):
         """Perform DBSCAN clustering from features or distance matrix.
 
         Parameters
         ----------
-        X: array [n_samples, n_samples] or [n_samples, n_features]
-            Array of distances between samples, or a feature array.
-            The array is treated as a feature array unless the metric is
-            given as 'precomputed'.
-        params: dict
-            Overwrite keywords from __init__.
+        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
+                array of shape (n_samples, n_samples)
+            A feature array, or array of distances between samples if
+            ``metric='precomputed'``.
+        sample_weight : array, shape (n_samples,), optional
+            Weight of each sample, such that a sample with weight greater
+            than ``min_samples`` is automatically a core sample; a sample with
+            negative weight may inhibit its eps-neighbor from being core.
+            Note that weights are absolute, and default to 1.
         """
-        X = np.asarray(X)
-        clust = dbscan(X, **self.get_params())
+        X = check_array(X, accept_sparse='csr')
+        clust = dbscan(X, sample_weight=sample_weight, **self.get_params())
         self.core_sample_indices_, self.labels_ = clust
         self.components_ = X[self.core_sample_indices_].copy()
         return self
+
+    def fit_predict(self, X, y=None, sample_weight=None):
+        """Performs clustering on X and returns cluster labels.
+
+        Parameters
+        ----------
+        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
+                array of shape (n_samples, n_samples)
+            A feature array, or array of distances between samples if
+            ``metric='precomputed'``.
+        sample_weight : array, shape (n_samples,), optional
+            Weight of each sample, such that a sample with weight greater
+            than ``min_samples`` is automatically a core sample; a sample with
+            negative weight may inhibit its eps-neighbor from being core.
+            Note that weights are absolute, and default to 1.
+
+        Returns
+        -------
+        y : ndarray, shape (n_samples,)
+            cluster labels
+        """
+        self.fit(X, sample_weight=sample_weight)
+        return self.labels_
diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
@@ -8,9 +8,13 @@
 from numpy.testing import assert_raises
 
 from scipy.spatial import distance
+from scipy import sparse
 
 from sklearn.utils.testing import assert_equal
-from sklearn.cluster.dbscan_ import DBSCAN, dbscan
+from sklearn.utils.testing import assert_array_equal
+from sklearn.utils.testing import assert_raises
+from sklearn.cluster.dbscan_ import DBSCAN
+from sklearn.cluster.dbscan_ import dbscan
 from .common import generate_clustered_data
 from sklearn.metrics.pairwise import pairwise_distances
 
@@ -65,6 +69,15 @@ def test_dbscan_feature():
     assert_equal(n_clusters_2, n_clusters)
 
 
+def test_dbscan_sparse():
+    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
+                                        min_samples=10, random_state=0)
+    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10,
+                                      random_state=0)
+    assert_array_equal(core_dense, core_sparse)
+    assert_array_equal(labels_dense, labels_sparse)
+
+
 def test_dbscan_callable():
     """Tests the DBSCAN algorithm with a callable metric."""
     # Parameters chosen specifically for this task.
@@ -159,3 +172,67 @@ def test_pickle():
     obj = DBSCAN()
     s = pickle.dumps(obj)
     assert_equal(type(pickle.loads(s)), obj.__class__)
+
+
+def test_weighted_dbscan():
+    # ensure sample_weight is validated
+    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
+    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])
+
+    # ensure sample_weight has an effect
+    assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
+                                  min_samples=5)[0])
+    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
+                                  min_samples=5)[0])
+    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
+                                   min_samples=5)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
+                                      min_samples=5)[0])
+
+    # points within eps of each other:
+    assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
+                                      sample_weight=[5, 1], min_samples=5)[0])
+    # and effect of non-positive and non-integer sample_weight:
+    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
+                                  eps=1.5, min_samples=5)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5, 0.1],
+                                      eps=1.5, min_samples=5)[0])
+    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
+                                      eps=1.5, min_samples=5)[0])
+    assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
+                                  eps=1.5, min_samples=5)[0])
+
+    # non-negative integer weights should give the same cores as repeated rows
+    rng = np.random.RandomState(42)
+    sample_weight = rng.randint(0, 5, X.shape[0])
+    core1, label1 = dbscan(X, sample_weight=sample_weight, random_state=42)
+    assert_equal(len(label1), len(X))
+
+    X_repeated = np.repeat(X, sample_weight, axis=0)
+    core_repeated, label_repeated = dbscan(X_repeated, random_state=42)
+    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
+    core_repeated_mask[core_repeated] = True
+    core_mask = np.zeros(X.shape[0], dtype=bool)
+    core_mask[core1] = True
+    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)
+
+    # sample_weight should work with precomputed distance matrix
+    D = pairwise_distances(X)
+    core3, label3 = dbscan(D, sample_weight=sample_weight,
+                           metric='precomputed', random_state=42)
+    assert_array_equal(core1, core3)
+    assert_array_equal(label1, label3)
+
+    # sample_weight should work with estimator
+    est = DBSCAN(random_state=42).fit(X, sample_weight=sample_weight)
+    core4 = est.core_sample_indices_
+    label4 = est.labels_
+    assert_array_equal(core1, core4)
+    assert_array_equal(label1, label4)
+
+    est = DBSCAN(random_state=42)
+    label5 = est.fit_predict(X, sample_weight=sample_weight)
+    core5 = est.core_sample_indices_
+    assert_array_equal(core1, core5)
+    assert_array_equal(label1, label5)
+    assert_array_equal(label1, est.labels_)