From 5cd2357cc591abc9fe65c2658e171d3eff661a51 Mon Sep 17 00:00:00 2001 From: Naoya Kanai Date: Sun, 1 Jan 2017 01:57:55 -0800 Subject: [PATCH 1/2] Add DBSCAN support for additional metric params --- sklearn/cluster/dbscan_.py | 19 +++++++++++++--- sklearn/cluster/tests/test_dbscan.py | 34 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index ed79546d73eb9..54d3839db6ae6 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -20,7 +20,7 @@ from ._dbscan_inner import dbscan_inner -def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', +def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1): """Perform DBSCAN clustering from vector array or distance matrix. @@ -50,6 +50,11 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', must be square. X may be a sparse matrix, in which case only "nonzero" elements may be considered neighbors for DBSCAN. + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + .. versionadded:: 0.19 + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. @@ -130,7 +135,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', else: neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm, leaf_size=leaf_size, - metric=metric, p=p, + metric=metric, + metric_params=metric_params, p=p, n_jobs=n_jobs) neighbors_model.fit(X) # This has worst case O(n^2) memory complexity @@ -184,6 +190,11 @@ class DBSCAN(BaseEstimator, ClusterMixin): .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + .. versionadded:: 0.19 + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. @@ -237,10 +248,12 @@ class DBSCAN(BaseEstimator, ClusterMixin): """ def __init__(self, eps=0.5, min_samples=5, metric='euclidean', - algorithm='auto', leaf_size=30, p=None, n_jobs=1): + metric_params=None, algorithm='auto', leaf_size=30, p=None, + n_jobs=1): self.eps = eps self.min_samples = min_samples self.metric = metric + self.metric_params = metric_params self.algorithm = algorithm self.leaf_size = leaf_size self.p = p diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index afddf52b03ae8..c051ca364a439 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -133,6 +133,40 @@ def test_dbscan_callable(): assert_equal(n_clusters_2, n_clusters) +def test_dbscan_metric_params(): + # Tests that DBSCAN works with the metrics_params argument. + # Parameters chosen specifically for this task. + # Different eps to other test, because distance is not normalised. + eps = 0.8 + min_samples = 10 + p = 2 + # Compute DBSCAN + # parameters chosen for task + core_samples, labels = dbscan(X, metric='minkowski', + metric_params={'p': p}, eps=eps, + min_samples=min_samples, + algorithm='ball_tree') + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert_equal(n_clusters_1, n_clusters) + + db = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples, + algorithm='ball_tree', p=p) + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert_equal(n_clusters_2, n_clusters) + + # Minkowski with p=2 should use Euclidean + db = DBSCAN(metric='euclidean', eps=eps, min_samples=min_samples, + algorithm='ball_tree') + labels = db.fit(X).labels_ + + n_clusters_3 = len(set(labels)) - int(-1 in labels) + assert_equal(n_clusters_3, n_clusters) + + def test_dbscan_balltree(): # Tests the DBSCAN algorithm with balltree for neighbor calculation. eps = 0.8 From c19173f9269e6db3f635acff24824f84f8ac3984 Mon Sep 17 00:00:00 2001 From: Naoya Kanai Date: Sun, 1 Jan 2017 04:49:05 -0800 Subject: [PATCH 2/2] Improve test for DBSCAN metric_params --- sklearn/cluster/dbscan_.py | 8 +++--- sklearn/cluster/tests/test_dbscan.py | 38 ++++++++++++---------------- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 54d3839db6ae6..a02db3feafb00 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -50,7 +50,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, must be square. X may be a sparse matrix, in which case only "nonzero" elements may be considered neighbors for DBSCAN. - metric_params : dict, optional (default = None) + metric_params : dict, optional Additional keyword arguments for the metric function. .. versionadded:: 0.19 @@ -190,10 +190,10 @@ class DBSCAN(BaseEstimator, ClusterMixin): .. versionadded:: 0.17 metric *precomputed* to accept precomputed sparse matrix. - metric_params : dict, optional (default = None) - Additional keyword arguments for the metric function. + metric_params : dict, optional + Additional keyword arguments for the metric function. - .. versionadded:: 0.19 + .. versionadded:: 0.19 algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index c051ca364a439..b4b34dcefb822 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -135,36 +135,30 @@ def test_dbscan_callable(): def test_dbscan_metric_params(): # Tests that DBSCAN works with the metrics_params argument. - # Parameters chosen specifically for this task. - # Different eps to other test, because distance is not normalised. eps = 0.8 min_samples = 10 - p = 2 - # Compute DBSCAN - # parameters chosen for task - core_samples, labels = dbscan(X, metric='minkowski', - metric_params={'p': p}, eps=eps, - min_samples=min_samples, - algorithm='ball_tree') + p = 1 - # number of clusters, ignoring noise if present - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert_equal(n_clusters_1, n_clusters) + # Compute DBSCAN with metric_params arg + db = DBSCAN(metric='minkowski', metric_params={'p': p}, eps=eps, + min_samples=min_samples, algorithm='ball_tree').fit(X) + core_sample_1, labels_1 = db.core_sample_indices_, db.labels_ + # Test that sample labels are the same as passing Minkowski 'p' directly db = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples, - algorithm='ball_tree', p=p) - labels = db.fit(X).labels_ + algorithm='ball_tree', p=p).fit(X) + core_sample_2, labels_2 = db.core_sample_indices_, db.labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert_equal(n_clusters_2, n_clusters) + assert_array_equal(core_sample_1, core_sample_2) + assert_array_equal(labels_1, labels_2) - # Minkowski with p=2 should use Euclidean - db = DBSCAN(metric='euclidean', eps=eps, min_samples=min_samples, - algorithm='ball_tree') - labels = db.fit(X).labels_ + # Minkowski with p=1 should be equivalent to Manhattan distance + db = DBSCAN(metric='manhattan', eps=eps, min_samples=min_samples, + algorithm='ball_tree').fit(X) + core_sample_3, labels_3 = db.core_sample_indices_, db.labels_ - n_clusters_3 = len(set(labels)) - int(-1 in labels) - assert_equal(n_clusters_3, n_clusters) + assert_array_equal(core_sample_1, core_sample_3) + assert_array_equal(labels_1, labels_3) def test_dbscan_balltree():