From 310c1d438146d90eee49feab255e5c7379794b48 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 16:16:25 +0800 Subject: [PATCH 1/8] paper typo --- sklearn/cluster/optics_.py | 2 +- sklearn/cluster/tests/test_optics.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 55f286404c9ee..fdda4a3a94f54 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -844,7 +844,7 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples, # Find the first index from the right side which is almost # at the same level as the beginning of the detected # cluster. - while (reachability_plot[c_end - 1] < D_max + while (reachability_plot[c_end - 1] > D_max and c_end > U_start): c_end -= 1 diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 6d23f0278df5a..2b980cb9e2f4c 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -127,6 +127,7 @@ def test_extract_xi(): def test_cluster_hierarchy_(): + rng = np.random.RandomState(0) n_points_per_cluster = 100 C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2) C2 = [0, 0] + 10 * rng.randn(n_points_per_cluster, 2) From 93de63e9d47c3ab6f1c81390102524b40868f4ed Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 16:28:36 +0800 Subject: [PATCH 2/8] update test --- sklearn/cluster/tests/test_optics.py | 36 +++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 2b980cb9e2f4c..751c999e952c4 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -84,7 +84,6 @@ def test_extract_xi(): # but with a clear noise data. rng = np.random.RandomState(0) n_points_per_cluster = 5 - C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) @@ -93,37 +92,34 @@ def test_extract_xi(): C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)) - expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, + expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5, -1, [4] * 5] - X, expected_labels = shuffle(X, expected_labels, random_state=rng) - - clust = OPTICS(min_samples=3, min_cluster_size=2, - max_eps=np.inf, cluster_method='xi', - xi=0.4).fit(X) - assert_array_equal(clust.labels_, expected_labels) + X, expected_labels = shuffle(X, expected_labels, random_state=0) + clust = OPTICS(min_samples=3, min_cluster_size=3, + max_eps=20, cluster_method='xi').fit(X) + assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) + assert np.array_equal(np.where(clust.labels_ == -1)[0], + np.where(expected_labels == -1)[0]) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) - expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, + expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5, -1, -1, [4] * 5] - X, expected_labels = shuffle(X, expected_labels, random_state=rng) - clust = OPTICS(min_samples=3, min_cluster_size=3, - max_eps=np.inf, cluster_method='xi', - xi=0.1).fit(X) - # this may fail if the predecessor correction is not at work! - assert_array_equal(clust.labels_, expected_labels) + max_eps=20, cluster_method='xi').fit(X) + assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) + assert np.array_equal(np.where(clust.labels_ == -1)[0], + np.where(expected_labels == -1)[0]) C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]] C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] X = np.vstack((C1, C2, C3)) expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] - X, expected_labels = shuffle(X, expected_labels, random_state=rng) - clust = OPTICS(min_samples=2, min_cluster_size=2, - max_eps=np.inf, cluster_method='xi', - xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) + max_eps=np.inf, cluster_method='xi').fit(X) + assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) + assert np.array_equal(np.where(clust.labels_ == -1)[0], + np.where(expected_labels == -1)[0]) def test_cluster_hierarchy_(): From 7f0ce7a0b9d8cf3276f1080fac518db972bd4fe5 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 16:30:33 +0800 Subject: [PATCH 3/8] remove test --- sklearn/cluster/tests/test_optics.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 751c999e952c4..c7cdb5a77764c 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -119,21 +119,7 @@ def test_extract_xi(): max_eps=np.inf, cluster_method='xi').fit(X) assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) assert np.array_equal(np.where(clust.labels_ == -1)[0], - np.where(expected_labels == -1)[0]) - - -def test_cluster_hierarchy_(): - rng = np.random.RandomState(0) - n_points_per_cluster = 100 - C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2) - C2 = [0, 0] + 10 * rng.randn(n_points_per_cluster, 2) - X = np.vstack((C1, C2)) - X = shuffle(X, random_state=0) - - clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_ - assert clusters.shape == (2, 2) - diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) - assert diff / len(X) < 0.05 + np.where(expected_labels == -1)[0]) def test_correct_number_of_clusters(): From b71bcff24f453a41a2cde17bea95e3832848f8f2 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 16:48:58 +0800 Subject: [PATCH 4/8] flake8 --- sklearn/cluster/optics_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index fdda4a3a94f54..9186ae953c50d 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -18,6 +18,7 @@ from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances +from ..metrics import v_measure_score class OPTICS(BaseEstimator, ClusterMixin): From 201cea8f5f23424f4f803a56519f796acc8b4fa6 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 16:59:43 +0800 Subject: [PATCH 5/8] flake8 --- sklearn/cluster/optics_.py | 1 - sklearn/cluster/tests/test_optics.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 9186ae953c50d..fdda4a3a94f54 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -18,7 +18,6 @@ from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances -from ..metrics import v_measure_score class OPTICS(BaseEstimator, ClusterMixin): diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index c7cdb5a77764c..527035f0c57d7 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -9,6 +9,7 @@ from sklearn.cluster.optics_ import (OPTICS, _extend_region, _extract_xi_labels) +from sklearn.metrics import v_measure_score from sklearn.metrics.cluster import contingency_matrix from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster.dbscan_ import DBSCAN From 42db2e67ef8dc4c73f1bba1c84d89aed64a5d055 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 20:35:55 +0800 Subject: [PATCH 6/8] revert changes in tests --- sklearn/cluster/tests/test_optics.py | 50 ++++++++++++++++++---------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 527035f0c57d7..2fcabf08c369c 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -9,7 +9,6 @@ from sklearn.cluster.optics_ import (OPTICS, _extend_region, _extract_xi_labels) -from sklearn.metrics import v_measure_score from sklearn.metrics.cluster import contingency_matrix from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster.dbscan_ import DBSCAN @@ -85,6 +84,7 @@ def test_extract_xi(): # but with a clear noise data. rng = np.random.RandomState(0) n_points_per_cluster = 5 + C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) @@ -93,34 +93,50 @@ def test_extract_xi(): C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)) - expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5, + expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5] - X, expected_labels = shuffle(X, expected_labels, random_state=0) - clust = OPTICS(min_samples=3, min_cluster_size=3, - max_eps=20, cluster_method='xi').fit(X) - assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) - assert np.array_equal(np.where(clust.labels_ == -1)[0], - np.where(expected_labels == -1)[0]) + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS(min_samples=3, min_cluster_size=2, + max_eps=20, cluster_method='xi', + xi=0.4).fit(X) + assert_array_equal(clust.labels_, expected_labels) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) - expected_labels = np.r_[[0] * 5, [1] * 5, [2] * 5, [3] * 5, [2] * 5, + expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + clust = OPTICS(min_samples=3, min_cluster_size=3, - max_eps=20, cluster_method='xi').fit(X) - assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) - assert np.array_equal(np.where(clust.labels_ == -1)[0], - np.where(expected_labels == -1)[0]) + max_eps=20, cluster_method='xi', + xi=0.1).fit(X) + # this may fail if the predecessor correction is not at work! + assert_array_equal(clust.labels_, expected_labels) C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]] C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] X = np.vstack((C1, C2, C3)) expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + clust = OPTICS(min_samples=2, min_cluster_size=2, - max_eps=np.inf, cluster_method='xi').fit(X) - assert np.isclose(v_measure_score(clust.labels_, expected_labels), 1) - assert np.array_equal(np.where(clust.labels_ == -1)[0], - np.where(expected_labels == -1)[0]) + max_eps=np.inf, cluster_method='xi', + xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + +def test_cluster_hierarchy_(): + n_points_per_cluster = 100 + C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2) + C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2) + X = np.vstack((C1, C2)) + X = shuffle(X, random_state=0) + + clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_ + assert clusters.shape == (2, 2) + diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) + assert diff / len(X) < 0.05 def test_correct_number_of_clusters(): From 2f50cfb90789c96b381d773693ee858ecd17d69b Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 20:44:36 +0800 Subject: [PATCH 7/8] Joel's comment --- sklearn/cluster/optics_.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index fdda4a3a94f54..c211b86a30eab 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -844,6 +844,9 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples, # Find the first index from the right side which is almost # at the same level as the beginning of the detected # cluster. + # Our implementation corrects a mistake in the original + # paper, i.e., in Definition 11 4c, r(x) < r(sD) should be + # r(x) > r(sD). while (reachability_plot[c_end - 1] > D_max and c_end > U_start): c_end -= 1 From 346788c8a0010a863e3069020613cf25fa96566d Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 30 Apr 2019 20:48:29 +0800 Subject: [PATCH 8/8] random_state --- sklearn/cluster/tests/test_optics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 2fcabf08c369c..1e3d99746c9e9 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -127,6 +127,7 @@ def test_extract_xi(): def test_cluster_hierarchy_(): + rng = np.random.RandomState(0) n_points_per_cluster = 100 C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2) C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)