From 310c1d438146d90eee49feab255e5c7379794b48 Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 16:16:25 +0800
Subject: [PATCH 1/8] Fix comparison typo from the OPTICS paper in _xi_cluster

---
 sklearn/cluster/optics_.py           | 2 +-
 sklearn/cluster/tests/test_optics.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 55f286404c9ee..fdda4a3a94f54 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -844,7 +844,7 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
                     # Find the first index from the right side which is almost
                     # at the same level as the beginning of the detected
                     # cluster.
-                    while (reachability_plot[c_end - 1] < D_max
+                    while (reachability_plot[c_end - 1] > D_max
                            and c_end > U_start):
                         c_end -= 1
 
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 6d23f0278df5a..2b980cb9e2f4c 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -127,6 +127,7 @@ def test_extract_xi():
 
 
 def test_cluster_hierarchy_():
+    rng = np.random.RandomState(0)
     n_points_per_cluster = 100
     C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
     C2 = [0, 0] + 10 * rng.randn(n_points_per_cluster, 2)

From 93de63e9d47c3ab6f1c81390102524b40868f4ed Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 16:28:36 +0800
Subject: [PATCH 2/8] update test

---
 sklearn/cluster/tests/test_optics.py | 36 +++++++++++++---------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 2b980cb9e2f4c..751c999e952c4 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -84,7 +84,6 @@ def test_extract_xi():
     # but with a clear noise data.
     rng = np.random.RandomState(0)
     n_points_per_cluster = 5
-
     C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
     C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
     C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
@@ -93,37 +92,34 @@ def test_extract_xi():
     C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2)
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
-    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5,
+    expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5,
                             -1, [4] * 5]
-    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
-
-    clust = OPTICS(min_samples=3, min_cluster_size=2,
-                   max_eps=np.inf, cluster_method='xi',
-                   xi=0.4).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
+    X, expected_labels = shuffle(X, expected_labels, random_state=0)
+    clust = OPTICS(min_samples=3, min_cluster_size=3,
+                   max_eps=20, cluster_method='xi').fit(X)
+    assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
+    assert np.array_equal(np.where(clust.labels_ == -1)[0],
+                          np.where(expected_labels == -1)[0])
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
-    expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
+    expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5,
                             -1, -1, [4] * 5]
-    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
-
     clust = OPTICS(min_samples=3, min_cluster_size=3,
-                   max_eps=np.inf, cluster_method='xi',
-                   xi=0.1).fit(X)
-    # this may fail if the predecessor correction is not at work!
-    assert_array_equal(clust.labels_, expected_labels)
+                   max_eps=20, cluster_method='xi').fit(X)
+    assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
+    assert np.array_equal(np.where(clust.labels_ == -1)[0],
+                          np.where(expected_labels == -1)[0])
 
     C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]]
     C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
     C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
     X = np.vstack((C1, C2, C3))
     expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
-    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
-
     clust = OPTICS(min_samples=2, min_cluster_size=2,
-                   max_eps=np.inf, cluster_method='xi',
-                   xi=0.04).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
+                   max_eps=np.inf, cluster_method='xi').fit(X)
+    assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
+    assert np.array_equal(np.where(clust.labels_ == -1)[0],
+                      np.where(expected_labels == -1)[0])
 
 
 def test_cluster_hierarchy_():

From 7f0ce7a0b9d8cf3276f1080fac518db972bd4fe5 Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 16:30:33 +0800
Subject: [PATCH 3/8] remove test

---
 sklearn/cluster/tests/test_optics.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 751c999e952c4..c7cdb5a77764c 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -119,21 +119,7 @@ def test_extract_xi():
                    max_eps=np.inf, cluster_method='xi').fit(X)
     assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
     assert np.array_equal(np.where(clust.labels_ == -1)[0],
-                      np.where(expected_labels == -1)[0])
-
-
-def test_cluster_hierarchy_():
-    rng = np.random.RandomState(0)
-    n_points_per_cluster = 100
-    C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
-    C2 = [0, 0] + 10 * rng.randn(n_points_per_cluster, 2)
-    X = np.vstack((C1, C2))
-    X = shuffle(X, random_state=0)
-
-    clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_
-    assert clusters.shape == (2, 2)
-    diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
-    assert diff / len(X) < 0.05
+                          np.where(expected_labels == -1)[0])
 
 
 def test_correct_number_of_clusters():

From b71bcff24f453a41a2cde17bea95e3832848f8f2 Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 16:48:58 +0800
Subject: [PATCH 4/8] flake8

---
 sklearn/cluster/optics_.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index fdda4a3a94f54..9186ae953c50d 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -18,6 +18,7 @@
 from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
+from ..metrics import v_measure_score
 
 
 class OPTICS(BaseEstimator, ClusterMixin):

From 201cea8f5f23424f4f803a56519f796acc8b4fa6 Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 16:59:43 +0800
Subject: [PATCH 5/8] flake8

---
 sklearn/cluster/optics_.py           | 1 -
 sklearn/cluster/tests/test_optics.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 9186ae953c50d..fdda4a3a94f54 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -18,7 +18,6 @@
 from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..metrics import v_measure_score
 
 
 class OPTICS(BaseEstimator, ClusterMixin):
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index c7cdb5a77764c..527035f0c57d7 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -9,6 +9,7 @@
 from sklearn.cluster.optics_ import (OPTICS,
                                      _extend_region,
                                      _extract_xi_labels)
+from sklearn.metrics import v_measure_score
 from sklearn.metrics.cluster import contingency_matrix
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster.dbscan_ import DBSCAN

From 42db2e67ef8dc4c73f1bba1c84d89aed64a5d055 Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 20:35:55 +0800
Subject: [PATCH 6/8] revert changes in tests

---
 sklearn/cluster/tests/test_optics.py | 50 ++++++++++++++++++----------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 527035f0c57d7..2fcabf08c369c 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -9,7 +9,6 @@
 from sklearn.cluster.optics_ import (OPTICS,
                                      _extend_region,
                                      _extract_xi_labels)
-from sklearn.metrics import v_measure_score
 from sklearn.metrics.cluster import contingency_matrix
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster.dbscan_ import DBSCAN
@@ -85,6 +84,7 @@ def test_extract_xi():
     # but with a clear noise data.
     rng = np.random.RandomState(0)
     n_points_per_cluster = 5
+
     C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
     C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
     C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
@@ -93,34 +93,50 @@ def test_extract_xi():
     C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2)
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
-    expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5,
+    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5,
                             -1, [4] * 5]
-    X, expected_labels = shuffle(X, expected_labels, random_state=0)
-    clust = OPTICS(min_samples=3, min_cluster_size=3,
-                   max_eps=20, cluster_method='xi').fit(X)
-    assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
-    assert np.array_equal(np.where(clust.labels_ == -1)[0],
-                          np.where(expected_labels == -1)[0])
+    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+
+    clust = OPTICS(min_samples=3, min_cluster_size=2,
+                   max_eps=20, cluster_method='xi',
+                   xi=0.4).fit(X)
+    assert_array_equal(clust.labels_, expected_labels)
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
-    expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5,
+    expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
                             -1, -1, [4] * 5]
+    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+
     clust = OPTICS(min_samples=3, min_cluster_size=3,
-                   max_eps=20, cluster_method='xi').fit(X)
-    assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
-    assert np.array_equal(np.where(clust.labels_ == -1)[0],
-                          np.where(expected_labels == -1)[0])
+                   max_eps=20, cluster_method='xi',
+                   xi=0.1).fit(X)
+    # this may fail if the predecessor correction is not at work!
+    assert_array_equal(clust.labels_, expected_labels)
 
     C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]]
     C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
     C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
     X = np.vstack((C1, C2, C3))
     expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
+    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+
     clust = OPTICS(min_samples=2, min_cluster_size=2,
-                   max_eps=np.inf, cluster_method='xi').fit(X)
-    assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1)
-    assert np.array_equal(np.where(clust.labels_ == -1)[0],
-                          np.where(expected_labels == -1)[0])
+                   max_eps=np.inf, cluster_method='xi',
+                   xi=0.04).fit(X)
+    assert_array_equal(clust.labels_, expected_labels)
+
+
+def test_cluster_hierarchy_():
+    n_points_per_cluster = 100
+    C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
+    C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, C2))
+    X = shuffle(X, random_state=0)
+
+    clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_
+    assert clusters.shape == (2, 2)
+    diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
+    assert diff / len(X) < 0.05
 
 
 def test_correct_number_of_clusters():

From 2f50cfb90789c96b381d773693ee858ecd17d69b Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 20:44:36 +0800
Subject: [PATCH 7/8] Document the paper correction (Joel's comment)

---
 sklearn/cluster/optics_.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index fdda4a3a94f54..c211b86a30eab 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -844,6 +844,9 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
                     # Find the first index from the right side which is almost
                     # at the same level as the beginning of the detected
                     # cluster.
+                    # Note: Definition 11 (4c) of the OPTICS paper has a
+                    # typo: it states r(x) < r(sD), but the condition must
+                    # be r(x) > r(sD), which is what is implemented here.
                     while (reachability_plot[c_end - 1] > D_max
                            and c_end > U_start):
                         c_end -= 1

From 346788c8a0010a863e3069020613cf25fa96566d Mon Sep 17 00:00:00 2001
From: Hanmin Qin <qinhanmin2005@sina.com>
Date: Tue, 30 Apr 2019 20:48:29 +0800
Subject: [PATCH 8/8] Define rng in test_cluster_hierarchy_

---
 sklearn/cluster/tests/test_optics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 2fcabf08c369c..1e3d99746c9e9 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -127,6 +127,7 @@ def test_extract_xi():
 
 
 def test_cluster_hierarchy_():
+    rng = np.random.RandomState(0)
     n_points_per_cluster = 100
     C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
     C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)