From 5cd2357cc591abc9fe65c2658e171d3eff661a51 Mon Sep 17 00:00:00 2001
From: Naoya Kanai <naopon@gmail.com>
Date: Sun, 1 Jan 2017 01:57:55 -0800
Subject: [PATCH 1/2] Add DBSCAN support for additional metric params

---
 sklearn/cluster/dbscan_.py           | 19 +++++++++++++---
 sklearn/cluster/tests/test_dbscan.py | 34 ++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index ed79546d73eb9..54d3839db6ae6 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -20,7 +20,7 @@
 from ._dbscan_inner import dbscan_inner
 
 
-def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
+def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
            algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1):
     """Perform DBSCAN clustering from vector array or distance matrix.
 
@@ -50,6 +50,11 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
         must be square. X may be a sparse matrix, in which case only "nonzero"
         elements may be considered neighbors for DBSCAN.
 
+    metric_params : dict, optional (default = None)
+        Additional keyword arguments for the metric function.
+
+        .. versionadded:: 0.19
+
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
         to compute pointwise distances and find nearest neighbors.
@@ -130,7 +135,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
     else:
         neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                            leaf_size=leaf_size,
-                                           metric=metric, p=p,
+                                           metric=metric,
+                                           metric_params=metric_params, p=p,
                                            n_jobs=n_jobs)
         neighbors_model.fit(X)
         # This has worst case O(n^2) memory complexity
@@ -184,6 +190,11 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         .. versionadded:: 0.17
            metric *precomputed* to accept precomputed sparse matrix.
 
+   metric_params : dict, optional (default = None)
+       Additional keyword arguments for the metric function.
+
+       .. versionadded:: 0.19
+
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
         to compute pointwise distances and find nearest neighbors.
@@ -237,10 +248,12 @@ class DBSCAN(BaseEstimator, ClusterMixin):
     """
 
     def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
-                 algorithm='auto', leaf_size=30, p=None, n_jobs=1):
+                 metric_params=None, algorithm='auto', leaf_size=30, p=None,
+                 n_jobs=1):
         self.eps = eps
         self.min_samples = min_samples
         self.metric = metric
+        self.metric_params = metric_params
         self.algorithm = algorithm
         self.leaf_size = leaf_size
         self.p = p
diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
index afddf52b03ae8..c051ca364a439 100644
--- a/sklearn/cluster/tests/test_dbscan.py
+++ b/sklearn/cluster/tests/test_dbscan.py
@@ -133,6 +133,40 @@ def test_dbscan_callable():
     assert_equal(n_clusters_2, n_clusters)
 
 
+def test_dbscan_metric_params():
+    # Tests that DBSCAN works with the metrics_params argument.
+    # Parameters chosen specifically for this task.
+    # Different eps to other test, because distance is not normalised.
+    eps = 0.8
+    min_samples = 10
+    p = 2
+    # Compute DBSCAN
+    # parameters chosen for task
+    core_samples, labels = dbscan(X, metric='minkowski',
+                                  metric_params={'p': p}, eps=eps,
+                                  min_samples=min_samples,
+                                  algorithm='ball_tree')
+
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)
+    assert_equal(n_clusters_1, n_clusters)
+
+    db = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples,
+                algorithm='ball_tree', p=p)
+    labels = db.fit(X).labels_
+
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
+    assert_equal(n_clusters_2, n_clusters)
+
+    # Minkowski with p=2 should use Euclidean
+    db = DBSCAN(metric='euclidean', eps=eps, min_samples=min_samples,
+                algorithm='ball_tree')
+    labels = db.fit(X).labels_
+
+    n_clusters_3 = len(set(labels)) - int(-1 in labels)
+    assert_equal(n_clusters_3, n_clusters)
+
+
 def test_dbscan_balltree():
     # Tests the DBSCAN algorithm with balltree for neighbor calculation.
     eps = 0.8

From c19173f9269e6db3f635acff24824f84f8ac3984 Mon Sep 17 00:00:00 2001
From: Naoya Kanai <naopon@gmail.com>
Date: Sun, 1 Jan 2017 04:49:05 -0800
Subject: [PATCH 2/2] Improve test for DBSCAN metric_params

---
 sklearn/cluster/dbscan_.py           |  8 +++---
 sklearn/cluster/tests/test_dbscan.py | 38 ++++++++++++----------------
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 54d3839db6ae6..a02db3feafb00 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -50,7 +50,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
         must be square. X may be a sparse matrix, in which case only "nonzero"
         elements may be considered neighbors for DBSCAN.
 
-    metric_params : dict, optional (default = None)
+    metric_params : dict, optional
         Additional keyword arguments for the metric function.
 
         .. versionadded:: 0.19
@@ -190,10 +190,10 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         .. versionadded:: 0.17
            metric *precomputed* to accept precomputed sparse matrix.
 
-   metric_params : dict, optional (default = None)
-       Additional keyword arguments for the metric function.
+    metric_params : dict, optional
+        Additional keyword arguments for the metric function.
 
-       .. versionadded:: 0.19
+        .. versionadded:: 0.19
 
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py
index c051ca364a439..b4b34dcefb822 100644
--- a/sklearn/cluster/tests/test_dbscan.py
+++ b/sklearn/cluster/tests/test_dbscan.py
@@ -135,36 +135,30 @@ def test_dbscan_callable():
 
 def test_dbscan_metric_params():
-    # Tests that DBSCAN works with the metrics_params argument.
+    # Tests that DBSCAN works with the metric_params argument.
-    # Parameters chosen specifically for this task.
-    # Different eps to other test, because distance is not normalised.
     eps = 0.8
     min_samples = 10
-    p = 2
-    # Compute DBSCAN
-    # parameters chosen for task
-    core_samples, labels = dbscan(X, metric='minkowski',
-                                  metric_params={'p': p}, eps=eps,
-                                  min_samples=min_samples,
-                                  algorithm='ball_tree')
+    p = 1
 
-    # number of clusters, ignoring noise if present
-    n_clusters_1 = len(set(labels)) - int(-1 in labels)
-    assert_equal(n_clusters_1, n_clusters)
+    # Compute DBSCAN with metric_params arg
+    db = DBSCAN(metric='minkowski', metric_params={'p': p}, eps=eps,
+                min_samples=min_samples, algorithm='ball_tree').fit(X)
+    core_sample_1, labels_1 = db.core_sample_indices_, db.labels_
 
+    # Test that core samples and labels match passing Minkowski 'p' directly
     db = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples,
-                algorithm='ball_tree', p=p)
-    labels = db.fit(X).labels_
+                algorithm='ball_tree', p=p).fit(X)
+    core_sample_2, labels_2 = db.core_sample_indices_, db.labels_
 
-    n_clusters_2 = len(set(labels)) - int(-1 in labels)
-    assert_equal(n_clusters_2, n_clusters)
+    assert_array_equal(core_sample_1, core_sample_2)
+    assert_array_equal(labels_1, labels_2)
 
-    # Minkowski with p=2 should use Euclidean
-    db = DBSCAN(metric='euclidean', eps=eps, min_samples=min_samples,
-                algorithm='ball_tree')
-    labels = db.fit(X).labels_
+    # Minkowski with p=1 should be equivalent to Manhattan distance
+    db = DBSCAN(metric='manhattan', eps=eps, min_samples=min_samples,
+                algorithm='ball_tree').fit(X)
+    core_sample_3, labels_3 = db.core_sample_indices_, db.labels_
 
-    n_clusters_3 = len(set(labels)) - int(-1 in labels)
-    assert_equal(n_clusters_3, n_clusters)
+    assert_array_equal(core_sample_1, core_sample_3)
+    assert_array_equal(labels_1, labels_3)
 
 
 def test_dbscan_balltree():