From f25a0eb82d76aaa14119e44bf6326b912c60d572 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 10:40:22 -0500 Subject: [PATCH 01/53] First cut at basic single linkage internals --- sklearn/cluster/_single_linkage.pyx | 206 ++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 sklearn/cluster/_single_linkage.pyx diff --git a/sklearn/cluster/_single_linkage.pyx b/sklearn/cluster/_single_linkage.pyx new file mode 100644 index 0000000000000..635fe507a646e --- /dev/null +++ b/sklearn/cluster/_single_linkage.pyx @@ -0,0 +1,206 @@ +# cython: boundscheck=False +# cython: nonecheck=False +# Minimum spanning tree single linkage implementation +# Authors: Leland McInnes, Steve Astels +# License: 3-clause BSD + +import numpy as np +cimport numpy as np + +from libc.float cimport DBL_MAX + +from dist_metrics cimport DistanceMetric + + +cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( + np.ndarray[np.double_t, + ndim=2] distance_matrix): + + cdef np.ndarray[np.intp_t, ndim=1] node_labels + cdef np.ndarray[np.intp_t, ndim=1] current_labels + cdef np.ndarray[np.double_t, ndim=1] current_distances + cdef np.ndarray[np.double_t, ndim=1] left + cdef np.ndarray[np.double_t, ndim=1] right + cdef np.ndarray[np.double_t, ndim=2] result + + cdef np.ndarray label_filter + + cdef np.intp_t current_node + cdef np.intp_t new_node_index + cdef np.intp_t new_node + cdef np.intp_t i + + result = np.zeros((distance_matrix.shape[0] - 1, 3)) + node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) + current_node = 0 + current_distances = np.infty * np.ones(distance_matrix.shape[0]) + current_labels = node_labels + for i in range(1, node_labels.shape[0]): + label_filter = current_labels != current_node + current_labels = current_labels[label_filter] + left = current_distances[label_filter] + right = distance_matrix[current_node][current_labels] + current_distances = np.where(left < right, left, right) + + new_node_index = np.argmin(current_distances) + new_node = current_labels[new_node_index] + result[i - 1, 0] = current_node + result[i - 1, 1] = new_node + result[i - 1, 2] = current_distances[new_node_index] + current_node = new_node + + return result + + +cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( + np.ndarray[np.double_t, ndim=2, mode='c'] raw_data, + DistanceMetric dist_metric): + + # Add a comment + cdef np.ndarray[np.double_t, ndim=1] current_distances_arr + cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr + cdef np.ndarray[np.double_t, ndim=2] result_arr + + cdef np.double_t * current_distances + cdef np.double_t * raw_data_ptr + cdef np.int8_t * in_tree + cdef np.double_t[:, ::1] raw_data_view + cdef np.double_t[:, ::1] result + + cdef np.ndarray label_filter + + cdef np.intp_t current_node + cdef np.intp_t new_node + cdef np.intp_t i + cdef np.intp_t j + cdef np.intp_t dim + cdef np.intp_t num_features + + cdef double right_value + cdef double left_value + cdef double core_value + cdef double new_distance + + dim = raw_data.shape[0] + num_features = raw_data.shape[1] + + raw_data_view = ( ( + raw_data.data)) + raw_data_ptr = ( &raw_data_view[0, 0]) + + result_arr = np.zeros((dim - 1, 3)) + in_tree_arr = np.zeros(dim, dtype=np.int8) + current_node = 0 + current_distances_arr = np.infty * np.ones(dim) + + result = ( ( result_arr.data)) + in_tree = ( in_tree_arr.data) + current_distances = ( current_distances_arr.data) + + for i in range(1, dim): + + in_tree[current_node] = 1 + + new_distance = DBL_MAX + new_node = 0 + + for j in 
range(dim): + if in_tree[j]: + continue + + right_value = current_distances[j] + left_value = dist_metric.dist(&raw_data_ptr[num_features * + current_node], + &raw_data_ptr[num_features * j], + num_features) + + if left_value > right_value: + if right_value < new_distance: + new_distance = right_value + new_node = j + continue + + if left_value < right_value: + current_distances[j] = left_value + if left_value < new_distance: + new_distance = left_value + new_node = j + else: + if right_value < new_distance: + new_distance = right_value + new_node = j + + result[i - 1, 0] = current_node + result[i - 1, 1] = new_node + result[i - 1, 2] = new_distance + current_node = new_node + + return result_arr + + +cdef class UnionFind (object): + + cdef np.ndarray parent_arr + cdef np.ndarray size_arr + cdef np.intp_t next_label + cdef np.intp_t *parent + cdef np.intp_t *size + + def __init__(self, N): + self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C') + self.next_label = N + self.size_arr = np.hstack((np.ones(N, dtype=np.intp), + np.zeros(N-1, dtype=np.intp))) + self.parent = ( self.parent_arr.data) + self.size = ( self.size_arr.data) + + cdef void union(self, np.intp_t m, np.intp_t n): + self.size[self.next_label] = self.size[m] + self.size[n] + self.parent[m] = self.next_label + self.parent[n] = self.next_label + self.size[self.next_label] = self.size[m] + self.size[n] + self.next_label += 1 + + return + + cdef np.intp_t fast_find(self, np.intp_t n): + cdef np.intp_t p + p = n + while self.parent_arr[n] != -1: + n = self.parent_arr[n] + # label up to the root + while self.parent_arr[p] != n: + p, self.parent_arr[p] = self.parent_arr[p], n + return n + + +cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L): + + cdef np.ndarray[np.double_t, ndim=2] result_arr + cdef np.double_t[:, ::1] result + + cdef np.intp_t N, a, aa, b, bb, index + cdef np.double_t delta + + result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) + result = ( ( + result_arr.data)) + N = L.shape[0] + 1 + U = UnionFind(N) + + for index in range(L.shape[0]): + + a = L[index, 0] + b = L[index, 1] + delta = L[index, 2] + + aa, bb = U.fast_find(a), U.fast_find(b) + + result[index][0] = aa + result[index][1] = bb + result[index][2] = delta + result[index][3] = U.size[aa] + U.size[bb] + + U.union(aa, bb) + + return result_arr From 2ed479979d9019dfa105530417e9b086dd50f11f Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 10:40:48 -0500 Subject: [PATCH 02/53] Refer to correct dist_metrics package --- sklearn/cluster/_single_linkage.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_single_linkage.pyx b/sklearn/cluster/_single_linkage.pyx index 635fe507a646e..6f22acf7f2682 100644 --- a/sklearn/cluster/_single_linkage.pyx +++ b/sklearn/cluster/_single_linkage.pyx @@ -9,7 +9,7 @@ cimport numpy as np from libc.float cimport DBL_MAX -from dist_metrics cimport DistanceMetric +from .neighbors.dist_metrics cimport DistanceMetric cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( From acfbddfb8fd54c3897b15fcd45194dfc0a47cd2a Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 11:19:01 -0500 Subject: [PATCH 03/53] Add csgraph sparse implementation for single linkage --- sklearn/cluster/hierarchical.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 29d725bd8ce54..e5e6343c1cd8c 100644 --- 
a/sklearn/cluster/hierarchical.py
+++ b/sklearn/cluster/hierarchical.py
@@ -374,7 +374,8 @@ def linkage_tree(X, connectivity=None, n_components=None,
     n_samples, n_features = X.shape
 
     linkage_choices = {'complete': _hierarchical.max_merge,
-                       'average': _hierarchical.average_merge}
+                       'average': _hierarchical.average_merge,
+                       'single' : None}  # Single linkage is handled differently
     try:
         join_func = linkage_choices[linkage]
     except KeyError:
@@ -438,6 +439,29 @@ def linkage_tree(X, connectivity=None, n_components=None,
                                             metric=affinity)
         connectivity.data = distances
 
+    if linkage == 'single':
+        from scipy.sparse.csgraph import minimum_spanning_tree
+
+        # Use scipy.sparse.csgraph to generate a minimum spanning tree
+        mst = minimum_spanning_tree(connectivity.tocsr())
+
+        # Convert the graph to scipy cluster array format
+        nonzeros = mst.nonzero()
+        nonzero_vals = mst.data
+        mst_array = np.vstack(nonzeros + (nonzero_vals,)).T
+
+        # Sort edges of the min_spanning_tree by weight
+        mst_array = mst_array[np.argsort(mst_array.T[2]),:][0]
+
+        # Convert edge list into standard hierarchical clustering format
+        single_linkage_tree = _hierarchical.single_linkage_label(mst_array)
+        children_ = single_linkage_tree[:, :2].astype(np.int)
+
+        if return_distance:
+            distances = single_linkage_tree[:, 2]
+            return children_, 1, n_samples, None, distances
+        return children_, 1, n_samples, None
+
     if n_clusters is None:
         n_nodes = 2 * n_samples - 1
     else:

From 2d5a95e2e1cd95f1e4c081e2d474c5d19b9dfd9e Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Sat, 15 Jul 2017 11:26:30 -0500
Subject: [PATCH 04/53] Add fast labelling/conversion from MST to single
 linkage tree; remove unneeded single_linkage.pyx file.

---
 sklearn/cluster/_hierarchical.pyx   |  76 ++++++++++
 sklearn/cluster/_single_linkage.pyx | 206 ----------------------------
 2 files changed, 76 insertions(+), 206 deletions(-)
 delete mode 100644 sklearn/cluster/_single_linkage.pyx

diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx
index 41daffd2b14c1..33aa2b80fb448 100644
--- a/sklearn/cluster/_hierarchical.pyx
+++ b/sklearn/cluster/_hierarchical.pyx
@@ -332,3 +332,79 @@ cdef class WeightedEdge:
                                        self.weight,
                                        self.a,
                                        self.b)
+
+################################################################################
+# Efficient labelling/conversion of MSTs to single linkage hierarchies
+
+cdef class UnionFind (object):
+
+    cdef np.ndarray parent_arr
+    cdef np.ndarray size_arr
+    cdef ITYPE_t next_label
+    cdef ITYPE_t *parent
+    cdef ITYPE_t *size
+
+    def __init__(self, N):
+        self.parent_arr = -1 * np.ones(2 * N - 1, dtype=ITYPE, order='C')
+        self.next_label = N
+        self.size_arr = np.hstack((np.ones(N, dtype=ITYPE),
+                                   np.zeros(N-1, dtype=ITYPE)))
+        self.parent = ( self.parent_arr.data)
+        self.size = ( self.size_arr.data)
+
+    @cython.boundscheck(False)
+    @cython.nonecheck(False)
+    cdef void union(self, ITYPE_t m, ITYPE_t n):
+        self.size[self.next_label] = self.size[m] + self.size[n]
+        self.parent[m] = self.next_label
+        self.parent[n] = self.next_label
+        self.size[self.next_label] = self.size[m] + self.size[n]
+        self.next_label += 1
+
+        return
+
+    @cython.boundscheck(False)
+    @cython.nonecheck(False)
+    cdef ITYPE_t fast_find(self, ITYPE_t n):
+        cdef ITYPE_t p
+        p = n
+        while self.parent_arr[n] != -1:
+            n = self.parent_arr[n]
+        # label up to the root
+        while self.parent_arr[p] != n:
+            p, self.parent_arr[p] = self.parent_arr[p], n
+        return n
+
+
+@cython.boundscheck(False)
+@cython.nonecheck(False)
+cpdef np.ndarray[DTYPE_t, ndim=2] 
label(np.ndarray[DTYPE_t, ndim=2] L): + + cdef np.ndarray[DTYPE_t, ndim=2] result_arr + cdef DTYPE_t[:, ::1] result + + cdef ITYPE_t N, a, aa, b, bb, index + cdef DTYPE_t delta + + result_arr = np.zeros((L.shape[0], L.shape[1] + 1), dtype=DTYPE) + result = ( ( + result_arr.data)) + N = L.shape[0] + 1 + U = UnionFind(N) + + for index in range(L.shape[0]): + + a = L[index, 0] + b = L[index, 1] + delta = L[index, 2] + + aa, bb = U.fast_find(a), U.fast_find(b) + + result[index][0] = aa + result[index][1] = bb + result[index][2] = delta + result[index][3] = U.size[aa] + U.size[bb] + + U.union(aa, bb) + + return result_arr diff --git a/sklearn/cluster/_single_linkage.pyx b/sklearn/cluster/_single_linkage.pyx deleted file mode 100644 index 6f22acf7f2682..0000000000000 --- a/sklearn/cluster/_single_linkage.pyx +++ /dev/null @@ -1,206 +0,0 @@ -# cython: boundscheck=False -# cython: nonecheck=False -# Minimum spanning tree single linkage implementation -# Authors: Leland McInnes, Steve Astels -# License: 3-clause BSD - -import numpy as np -cimport numpy as np - -from libc.float cimport DBL_MAX - -from .neighbors.dist_metrics cimport DistanceMetric - - -cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( - np.ndarray[np.double_t, - ndim=2] distance_matrix): - - cdef np.ndarray[np.intp_t, ndim=1] node_labels - cdef np.ndarray[np.intp_t, ndim=1] current_labels - cdef np.ndarray[np.double_t, ndim=1] current_distances - cdef np.ndarray[np.double_t, ndim=1] left - cdef np.ndarray[np.double_t, ndim=1] right - cdef np.ndarray[np.double_t, ndim=2] result - - cdef np.ndarray label_filter - - cdef np.intp_t current_node - cdef np.intp_t new_node_index - cdef np.intp_t new_node - cdef np.intp_t i - - result = np.zeros((distance_matrix.shape[0] - 1, 3)) - node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) - current_node = 0 - current_distances = np.infty * np.ones(distance_matrix.shape[0]) - current_labels = node_labels - for i in range(1, node_labels.shape[0]): - label_filter = current_labels != current_node - current_labels = current_labels[label_filter] - left = current_distances[label_filter] - right = distance_matrix[current_node][current_labels] - current_distances = np.where(left < right, left, right) - - new_node_index = np.argmin(current_distances) - new_node = current_labels[new_node_index] - result[i - 1, 0] = current_node - result[i - 1, 1] = new_node - result[i - 1, 2] = current_distances[new_node_index] - current_node = new_node - - return result - - -cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( - np.ndarray[np.double_t, ndim=2, mode='c'] raw_data, - DistanceMetric dist_metric): - - # Add a comment - cdef np.ndarray[np.double_t, ndim=1] current_distances_arr - cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr - cdef np.ndarray[np.double_t, ndim=2] result_arr - - cdef np.double_t * current_distances - cdef np.double_t * raw_data_ptr - cdef np.int8_t * in_tree - cdef np.double_t[:, ::1] raw_data_view - cdef np.double_t[:, ::1] result - - cdef np.ndarray label_filter - - cdef np.intp_t current_node - cdef np.intp_t new_node - cdef np.intp_t i - cdef np.intp_t j - cdef np.intp_t dim - cdef np.intp_t num_features - - cdef double right_value - cdef double left_value - cdef double core_value - cdef double new_distance - - dim = raw_data.shape[0] - num_features = raw_data.shape[1] - - raw_data_view = ( ( - raw_data.data)) - raw_data_ptr = ( &raw_data_view[0, 0]) - - result_arr = np.zeros((dim - 1, 3)) - in_tree_arr = np.zeros(dim, dtype=np.int8) - current_node = 0 - 
current_distances_arr = np.infty * np.ones(dim) - - result = ( ( result_arr.data)) - in_tree = ( in_tree_arr.data) - current_distances = ( current_distances_arr.data) - - for i in range(1, dim): - - in_tree[current_node] = 1 - - new_distance = DBL_MAX - new_node = 0 - - for j in range(dim): - if in_tree[j]: - continue - - right_value = current_distances[j] - left_value = dist_metric.dist(&raw_data_ptr[num_features * - current_node], - &raw_data_ptr[num_features * j], - num_features) - - if left_value > right_value: - if right_value < new_distance: - new_distance = right_value - new_node = j - continue - - if left_value < right_value: - current_distances[j] = left_value - if left_value < new_distance: - new_distance = left_value - new_node = j - else: - if right_value < new_distance: - new_distance = right_value - new_node = j - - result[i - 1, 0] = current_node - result[i - 1, 1] = new_node - result[i - 1, 2] = new_distance - current_node = new_node - - return result_arr - - -cdef class UnionFind (object): - - cdef np.ndarray parent_arr - cdef np.ndarray size_arr - cdef np.intp_t next_label - cdef np.intp_t *parent - cdef np.intp_t *size - - def __init__(self, N): - self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C') - self.next_label = N - self.size_arr = np.hstack((np.ones(N, dtype=np.intp), - np.zeros(N-1, dtype=np.intp))) - self.parent = ( self.parent_arr.data) - self.size = ( self.size_arr.data) - - cdef void union(self, np.intp_t m, np.intp_t n): - self.size[self.next_label] = self.size[m] + self.size[n] - self.parent[m] = self.next_label - self.parent[n] = self.next_label - self.size[self.next_label] = self.size[m] + self.size[n] - self.next_label += 1 - - return - - cdef np.intp_t fast_find(self, np.intp_t n): - cdef np.intp_t p - p = n - while self.parent_arr[n] != -1: - n = self.parent_arr[n] - # label up to the root - while self.parent_arr[p] != n: - p, self.parent_arr[p] = self.parent_arr[p], n - return n - - -cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L): - - cdef np.ndarray[np.double_t, ndim=2] result_arr - cdef np.double_t[:, ::1] result - - cdef np.intp_t N, a, aa, b, bb, index - cdef np.double_t delta - - result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) - result = ( ( - result_arr.data)) - N = L.shape[0] + 1 - U = UnionFind(N) - - for index in range(L.shape[0]): - - a = L[index, 0] - b = L[index, 1] - delta = L[index, 2] - - aa, bb = U.fast_find(a), U.fast_find(b) - - result[index][0] = aa - result[index][1] = bb - result[index][2] = delta - result[index][3] = U.size[aa] + U.size[bb] - - U.union(aa, bb) - - return result_arr From b5fa65bd5ce97f694937eca3187d476d1440da1d Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 11:56:31 -0500 Subject: [PATCH 05/53] Ensure existing tests cover single linkage --- sklearn/cluster/hierarchical.py | 7 ++++++- sklearn/cluster/tests/test_hierarchical.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index e5e6343c1cd8c..4c5b3d1a3791c 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -550,11 +550,16 @@ def _average_linkage(*args, **kwargs): kwargs['linkage'] = 'average' return linkage_tree(*args, **kwargs) +def _single_linkage(*args, **kwargs): + kwargs['linkage'] = 'single' + return linkage_tree(*args, **kwargs) + _TREE_BUILDERS = dict( ward=ward_tree, complete=_complete_linkage, - average=_average_linkage) + average=_average_linkage, + 
single=_single_linkage)
 
 
 ###############################################################################
diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
index 986b92e0ce9f4..7cacc6418acae 100644
--- a/sklearn/cluster/tests/test_hierarchical.py
+++ b/sklearn/cluster/tests/test_hierarchical.py
@@ -134,7 +134,7 @@ def test_agglomerative_clustering():
     n_samples = 100
     X = rng.randn(n_samples, 50)
     connectivity = grid_to_graph(*mask.shape)
-    for linkage in ("ward", "complete", "average"):
+    for linkage in ("ward", "complete", "average", "single"):
         clustering = AgglomerativeClustering(n_clusters=10,
                                              connectivity=connectivity,
                                              linkage=linkage)
@@ -340,7 +340,7 @@ def test_ward_linkage_tree_return_distance():
 
     assert_array_almost_equal(dist_unstructured, dist_structured)
 
-    for linkage in ['average', 'complete']:
+    for linkage in ['average', 'complete', 'single']:
         structured_items = linkage_tree(
             X, connectivity=connectivity, linkage=linkage,
             return_distance=True)[-1]
@@ -398,7 +398,7 @@ def test_ward_linkage_tree_return_distance():
     assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
     assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])
 
-    linkage_options = ['complete', 'average']
+    linkage_options = ['complete', 'average', 'single']
     X_linkage_truth = [linkage_X_complete, linkage_X_average]
     for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
         out_X_unstructured = linkage_tree(

From 2d25d1ccc495ddb864f4ab275ee472c55c736aa8 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Sat, 15 Jul 2017 14:58:18 -0500
Subject: [PATCH 06/53] Name single linkage labelling correctly.

---
 sklearn/cluster/_hierarchical.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx
index 33aa2b80fb448..151a412f461b6 100644
--- a/sklearn/cluster/_hierarchical.pyx
+++ b/sklearn/cluster/_hierarchical.pyx
@@ -378,7 +378,8 @@ cdef class UnionFind (object):
 
 @cython.boundscheck(False)
 @cython.nonecheck(False)
-cpdef np.ndarray[DTYPE_t, ndim=2] label(np.ndarray[DTYPE_t, ndim=2] L):
+cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label(
+        np.ndarray[DTYPE_t, ndim=2] L):

From 0a14920c4315d35d6c59e9a7a211cf1653485fd3 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Sat, 15 Jul 2017 14:59:19 -0500
Subject: [PATCH 07/53] Iterating toward correct solution. 
Still have to get n_clusters, compute_full_tree=False working --- sklearn/cluster/hierarchical.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 4c5b3d1a3791c..de0d99c5ccebe 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -439,6 +439,12 @@ def linkage_tree(X, connectivity=None, n_components=None, metric=affinity) connectivity.data = distances + if n_clusters is None: + n_nodes = 2 * n_samples - 1 + else: + assert n_clusters <= n_samples + n_nodes = 2 * n_samples - n_clusters + if linkage == 'single': from scipy.sparse.csgraph import minimum_spanning_tree @@ -451,22 +457,26 @@ def linkage_tree(X, connectivity=None, n_components=None, mst_array = np.vstack(nonzeros + (nonzero_vals,)).T # Sort edges of the min_spanning_tree by weight - mst_array = mst_array[np.argsort(mst_array.T[2]),:][0] + mst_array = mst_array[np.argsort(mst_array.T[2]),:] # Convert edge list into standard hierarchical clustering format single_linkage_tree = _hierarchical.single_linkage_label(mst_array) children_ = single_linkage_tree[:, :2].astype(np.int) + # Compute parents + parent = np.zeros(n_nodes, dtype=np.intp) + for i, (left, right) in enumerate(children_): + if n_clusters is not None and i >= n_samples - n_clusters: + break + if left < n_nodes: + parent[left] = i + n_samples + if right < n_nodes: + parent[right] = i + n_samples + if return_distance: distances = single_linkage_tree[:, 2] - return children_, 1, n_samples, None, distances - return children_, 1, n_samples, None - - if n_clusters is None: - n_nodes = 2 * n_samples - 1 - else: - assert n_clusters <= n_samples - n_nodes = 2 * n_samples - n_clusters + return children_, 1, n_samples, parent, distances + return children_, 1, n_samples, parent if return_distance: distances = np.empty(n_nodes - n_samples) From 71a3c9870f5b5a1faebaf830192a5d82482bc938 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 15:06:39 -0500 Subject: [PATCH 08/53] Get n_components correct. --- sklearn/cluster/hierarchical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index de0d99c5ccebe..8bb43dd3bfa50 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -475,8 +475,8 @@ def linkage_tree(X, connectivity=None, n_components=None, if return_distance: distances = single_linkage_tree[:, 2] - return children_, 1, n_samples, parent, distances - return children_, 1, n_samples, parent + return children_, n_components, n_samples, parent, distances + return children_, n_components, n_samples, parent if return_distance: distances = np.empty(n_nodes - n_samples) From 801ffa18b3cd275fa44210bbc98f2ca09aac5366 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 15:10:32 -0500 Subject: [PATCH 09/53] Update docstrings. 
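
The criteria documented here are just different reductions over the
cross-cluster pairwise distances. For reference, a minimal illustrative
sketch of the definitions (the `linkage_distance` helper is hypothetical
and not part of this patch; the real tree builders never materialise the
full distance matrix):

    import numpy as np
    from scipy.spatial.distance import cdist

    def linkage_distance(A, B, criterion):
        # All pairwise distances between points of cluster A and cluster B.
        D = cdist(A, B)
        return {'single': D.min(),     # distance of the closest pair
                'complete': D.max(),   # distance of the farthest pair
                'average': D.mean()}[criterion]

    A = np.array([[0., 0.], [0., 1.]])
    B = np.array([[3., 0.], [4., 0.]])
    print(linkage_distance(A, B, 'single'))    # 3.0
    print(linkage_distance(A, B, 'complete'))  # 4.1231...
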
--- sklearn/cluster/hierarchical.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 8bb43dd3bfa50..225e35f10215e 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -288,7 +288,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): return children, n_components, n_leaves, parent -# average and complete linkage +# single, average and complete linkage def linkage_tree(X, connectivity=None, n_components=None, n_clusters=None, linkage='complete', affinity="euclidean", return_distance=False): @@ -323,13 +323,15 @@ def linkage_tree(X, connectivity=None, n_components=None, limited use, and the 'parents' output should rather be used. This option is valid only when specifying a connectivity matrix. - linkage : {"average", "complete"}, optional, default: "complete" + linkage : {"average", "complete", "single"}, optional, default: "complete" Which linkage criteria to use. The linkage criterion determines which distance to use between sets of observation. - average uses the average of the distances of each observation of the two sets - complete or maximum linkage uses the maximum distances between all observations of the two sets. + - single uses the minimum of the distances between all observations + of the two sets. affinity : string or callable, optional, default: "euclidean". which metric to use. Can be "euclidean", "manhattan", or any @@ -665,7 +667,7 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): when varying the number of clusters and using caching, it may be advantageous to compute the full tree. - linkage : {"ward", "complete", "average"}, optional, default: "ward" + linkage : {"ward", "complete", "average", "single"}, optional, default: "ward" Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -675,6 +677,8 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): the two sets. - complete or maximum linkage uses the maximum distances between all observations of the two sets. + - single uses the minimum of the distances between all observations + of the two sets. pooling_func : callable, default=np.mean This combines the values of agglomerated features into a single @@ -835,7 +839,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): when varying the number of clusters and using caching, it may be advantageous to compute the full tree. - linkage : {"ward", "complete", "average"}, optional, default "ward" + linkage : {"ward", "complete", "average", "single"}, optional, default "ward" Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -845,6 +849,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): the two sets. - complete or maximum linkage uses the maximum distances between all features of the two sets. + - single uses the minimum of the distances between all observations + of the two sets. 
pooling_func : callable, default np.mean This combines the values of agglomerated features into a single From c84496f10859a7aaffdc5e47e893341930022315 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 15:35:42 -0500 Subject: [PATCH 10/53] Fix the parents array when we don't get the "full tree" --- sklearn/cluster/hierarchical.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 225e35f10215e..67151f14fe794 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -466,14 +466,14 @@ def linkage_tree(X, connectivity=None, n_components=None, children_ = single_linkage_tree[:, :2].astype(np.int) # Compute parents - parent = np.zeros(n_nodes, dtype=np.intp) - for i, (left, right) in enumerate(children_): - if n_clusters is not None and i >= n_samples - n_clusters: + parent = np.arange(n_nodes, dtype=np.intp) + for i, (left, right) in enumerate(children_, n_samples): + if n_clusters is not None and i >= n_nodes: break if left < n_nodes: - parent[left] = i + n_samples + parent[left] = i if right < n_nodes: - parent[right] = i + n_samples + parent[right] = i if return_distance: distances = single_linkage_tree[:, 2] @@ -787,8 +787,9 @@ def fit(self, X, y=None): n_clusters=n_clusters, **kwargs) # Cut the tree + # if compute_full_tree or self.linkage == 'single': if compute_full_tree: - self.labels_ = _hc_cut(self.n_clusters, self.children_, + self.labels_ = _hc_cut(self.n_clusters, self.children_, self.n_leaves_) else: labels = _hierarchical.hc_get_heads(parents, copy=False) From 8b291adf7cd797fd19314f54fafb1508a906ea59 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 15:39:27 -0500 Subject: [PATCH 11/53] Add single linkage to agglomerative clustering example. --- examples/cluster/plot_agglomerative_clustering.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py index dfb27d17d1a89..a8a49283a8474 100644 --- a/examples/cluster/plot_agglomerative_clustering.py +++ b/examples/cluster/plot_agglomerative_clustering.py @@ -52,7 +52,10 @@ for connectivity in (None, knn_graph): for n_clusters in (30, 3): plt.figure(figsize=(10, 4)) - for index, linkage in enumerate(('average', 'complete', 'ward')): + for index, linkage in enumerate(('average', + 'complete', + 'ward', + 'single')): plt.subplot(1, 3, index + 1) model = AgglomerativeClustering(linkage=linkage, connectivity=connectivity, From fc9779256f7919be6f2600a89ca32f595a6048ba Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 15:40:50 -0500 Subject: [PATCH 12/53] Add single linkage to digits agglomerative clustering example. 
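
For context, the new criterion is used exactly like the existing ones; a
minimal stand-alone sketch (dataset and n_clusters chosen only for
illustration):

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.datasets import load_digits

    X, y = load_digits(return_X_y=True)
    # 'single' merges the two clusters with the smallest minimum
    # pairwise distance at each step.
    clustering = AgglomerativeClustering(n_clusters=10, linkage='single')
    labels = clustering.fit_predict(X)

Expect very uneven cluster sizes here: single linkage chains nearby
points together, so most samples tend to land in one large cluster.
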
--- examples/cluster/plot_digits_linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index f1fe1783c10e5..6aca808be270d 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -79,7 +79,7 @@ def plot_clustering(X_red, X, labels, title=None): from sklearn.cluster import AgglomerativeClustering -for linkage in ('ward', 'average', 'complete'): +for linkage in ('ward', 'average', 'complete', 'single'): clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) t0 = time() clustering.fit(X_red) From b187fb56b0767557102f6bcc34a5133a864bec3a Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 15:45:59 -0500 Subject: [PATCH 13/53] Update documentation to reflect the addition of single linkage. --- doc/modules/clustering.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 7189474752005..7ea6dd1cb0313 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -556,10 +556,10 @@ considers at each step all the possible merges. number of features. It is a dimensionality reduction tool, see :ref:`data_reduction`. -Different linkage type: Ward, complete and average linkage +Different linkage type: Ward, complete, average and single linkage ----------------------------------------------------------- -:class:`AgglomerativeClustering` supports Ward, average, and complete +:class:`AgglomerativeClustering` supports Ward, single, average, and complete linkage strategies. .. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_001.png @@ -579,7 +579,9 @@ Agglomerative cluster has a "rich get richer" behavior that leads to uneven cluster sizes. In this regard, complete linkage is the worst strategy, and Ward gives the most regular sizes. However, the affinity (or distance used in clustering) cannot be varied with Ward, thus for non -Euclidean metrics, average linkage is a good alternative. +Euclidean metrics, average linkage is a good alternative. Single linkage, +while not robust to noisy data, can computed very efficiently and can +therefore be useful to provide hierarchical clustering of larger datasets. .. topic:: Examples: @@ -671,7 +673,7 @@ enable only merging of neighboring pixels on an image, as in the Varying the metric ------------------- -Average and complete linkage can be used with a variety of distances (or +Single, average and complete linkage can be used with a variety of distances (or affinities), in particular Euclidean distance (*l2*), Manhattan distance (or Cityblock, or *l1*), cosine distance, or any precomputed affinity matrix. From aa50b0734ec048421f6c6d252174dba3fb57bd5d Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 16:04:13 -0500 Subject: [PATCH 14/53] Update documentation to reflect the addition of single linkage. --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 7ea6dd1cb0313..60e68a1827bf2 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -580,7 +580,7 @@ uneven cluster sizes. In this regard, complete linkage is the worst strategy, and Ward gives the most regular sizes. However, the affinity (or distance used in clustering) cannot be varied with Ward, thus for non Euclidean metrics, average linkage is a good alternative. 
Single linkage, -while not robust to noisy data, can computed very efficiently and can +while not robust to noisy data, can be computed very efficiently and can therefore be useful to provide hierarchical clustering of larger datasets. .. topic:: Examples: From 5d838bc0d1bdbd4d18388021d80f4a2fbe48c9e8 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 16:06:16 -0500 Subject: [PATCH 15/53] Pep8 fix for class declaration in cython --- sklearn/cluster/_hierarchical.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index 151a412f461b6..3ee67ed3b00e9 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -336,7 +336,7 @@ cdef class WeightedEdge: ################################################################################ # Efficient labelling/conversion of MSTs to single linkage hierarchies -cdef class UnionFind (object): +cdef class UnionFind(object): cdef np.ndarray parent_arr cdef np.ndarray size_arr From b5ba3407b2b3a74d2b5296a932c230f1e4ffec3d Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 16:07:17 -0500 Subject: [PATCH 16/53] Fix heading in clustering docs --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 60e68a1827bf2..1a23b9f3de890 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -557,7 +557,7 @@ considers at each step all the possible merges. :ref:`data_reduction`. Different linkage type: Ward, complete, average and single linkage ------------------------------------------------------------ +------------------------------------------------------------------ :class:`AgglomerativeClustering` supports Ward, single, average, and complete linkage strategies. From 67e63a193a2638cde96474f3795c976dc5a5bc5c Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 16:51:01 -0500 Subject: [PATCH 17/53] Update the digits clustering text to reflect the new reality. --- examples/cluster/plot_digits_linkage.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index 6aca808be270d..d08842d74a374 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -12,8 +12,10 @@ What this example shows us is the behavior "rich getting richer" of agglomerative clustering that tends to create uneven cluster sizes. -This behavior is especially pronounced for the average linkage strategy, -that ends up with a couple of singleton clusters. +This behavior is pronounced for the average linkage strategy, +that ends up with a couple of singleton clusters, while in the case +of single linkage we get a single central cluster with all other clusters +being drawn from noise points around the fringes. """ # Authors: Gael Varoquaux From 73b8f4c108f8a83cd5525eafe160bda769f00c16 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 16:53:17 -0500 Subject: [PATCH 18/53] Provide a more complete comparison of the different linkage methods, highlighting the relative strengths and weaknesses. 
--- examples/cluster/plot_linkage_comparison.py | 159 ++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 examples/cluster/plot_linkage_comparison.py diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py new file mode 100644 index 0000000000000..2b7f5cb419650 --- /dev/null +++ b/examples/cluster/plot_linkage_comparison.py @@ -0,0 +1,159 @@ +""" +================================================================ +Comparing different hierarchical linkage methods on toy datasets +================================================================ + +This example shows characteristics of different linkage +methods for hierarchical clustering on datasets that are +"interesting" but still in 2D. + +The main observations to make are: +* single linkage is fast, and can perform well on +non-globular data, but it performs poorly in the +presence of noise. +* average and complete linkage perform well on +cleanly separated globular clusters, but have mixed +results otherwise. +* Ward is the most effective method for noisy data. + +While these examples give some intuition about the +algorithms, this intuition might not apply to very high +dimensional data. +""" +print(__doc__) + +import time +import warnings + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn import cluster, datasets, mixture +from sklearn.neighbors import kneighbors_graph +from sklearn.preprocessing import StandardScaler +from itertools import cycle, islice + +np.random.seed(0) + +# ============ +# Generate datasets. We choose the size big enough to see the scalability +# of the algorithms, but not too big to avoid too long running times +# ============ +n_samples = 1500 +noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, + noise=.05) +noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) +blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) +no_structure = np.random.rand(n_samples, 2), None + +# Anisotropicly distributed data +random_state = 170 +X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state) +transformation = [[0.6, -0.6], [-0.4, 0.8]] +X_aniso = np.dot(X, transformation) +aniso = (X_aniso, y) + +# blobs with varied variances +varied = datasets.make_blobs(n_samples=n_samples, + cluster_std=[1.0, 2.5, 0.5], + random_state=random_state) + +# ============ +# Set up cluster parameters +# ============ +plt.figure(figsize=(9 * 2 + 3, 12.5)) +plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, + hspace=.01) + +plot_num = 1 + +default_base = {'n_neighbors': 10, + 'n_clusters': 3} + +datasets = [ + (noisy_circles, {'n_clusters': 2}), + (noisy_moons, {'n_clusters': 2}), + (varied, {'n_neighbors': 2}), + (aniso, {'n_neighbors': 2}), + (blobs, {}), + (no_structure, {})] + +for i_dataset, (dataset, algo_params) in enumerate(datasets): + # update parameters with dataset-specific values + params = default_base.copy() + params.update(algo_params) + + X, y = dataset + + # normalize dataset for easier parameter selection + X = StandardScaler().fit_transform(X) + + # connectivity matrix for structured Ward + connectivity = kneighbors_graph( + X, n_neighbors=params['n_neighbors'], include_self=False) + # make connectivity symmetric + connectivity = 0.5 * (connectivity + connectivity.T) + + # ============ + # Create cluster objects + # ============ + ward = cluster.AgglomerativeClustering( + n_clusters=params['n_clusters'], linkage='ward') + complete = 
cluster.AgglomerativeClustering( + n_clusters=params['n_clusters'], linkage='complete') + average = cluster.AgglomerativeClustering( + n_clusters=params['n_clusters'], linkage='average') + single = cluster.AgglomerativeClustering( + n_clusters=params['n_clusters'], linkage='single') + + clustering_algorithms = ( + ('Single Linkage', single), + ('Average Linkage', average), + ('Complete Linkage', complete), + ('Ward Linkage', ward), + ) + + for name, algorithm in clustering_algorithms: + t0 = time.time() + + # catch warnings related to kneighbors_graph + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="the number of connected components of the " + + "connectivity matrix is [0-9]{1,2}" + + " > 1. Completing it to avoid stopping the tree early.", + category=UserWarning) + warnings.filterwarnings( + "ignore", + message="Graph is not fully connected, spectral embedding" + + " may not work as expected.", + category=UserWarning) + algorithm.fit(X) + + t1 = time.time() + if hasattr(algorithm, 'labels_'): + y_pred = algorithm.labels_.astype(np.int) + else: + y_pred = algorithm.predict(X) + + plt.subplot(len(datasets), len(clustering_algorithms), plot_num) + if i_dataset == 0: + plt.title(name, size=18) + + colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', + '#f781bf', '#a65628', '#984ea3', + '#999999', '#e41a1c', '#dede00']), + int(max(y_pred) + 1)))) + plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) + + plt.xlim(-2.5, 2.5) + plt.ylim(-2.5, 2.5) + plt.xticks(()) + plt.yticks(()) + plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), + transform=plt.gca().transAxes, size=15, + horizontalalignment='right') + plot_num += 1 + +plt.show() From 2895849725e4bac202b4eef7518bf362e490ce50 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 16:55:31 -0500 Subject: [PATCH 19/53] We don't need connectivity here, and we can ignore issues with warnings for spectral clustering. --- examples/cluster/plot_linkage_comparison.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py index 2b7f5cb419650..100ea83e1ecce 100644 --- a/examples/cluster/plot_linkage_comparison.py +++ b/examples/cluster/plot_linkage_comparison.py @@ -88,12 +88,6 @@ # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) - # connectivity matrix for structured Ward - connectivity = kneighbors_graph( - X, n_neighbors=params['n_neighbors'], include_self=False) - # make connectivity symmetric - connectivity = 0.5 * (connectivity + connectivity.T) - # ============ # Create cluster objects # ============ @@ -124,11 +118,6 @@ "connectivity matrix is [0-9]{1,2}" + " > 1. Completing it to avoid stopping the tree early.", category=UserWarning) - warnings.filterwarnings( - "ignore", - message="Graph is not fully connected, spectral embedding" + - " may not work as expected.", - category=UserWarning) algorithm.fit(X) t1 = time.time() From 3fc770fae05f751c5e04edf9a32d450ef9ae8c5d Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 17:07:35 -0500 Subject: [PATCH 20/53] Add an explicit test that single linkage successfully works on examples it should perform well on. 
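
The test relies on normalized mutual information, which is invariant to
permutations of the label values, so it does not matter which moon
receives which cluster id. The core of the check, sketched stand-alone:

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.datasets import make_moons
    from sklearn.metrics import normalized_mutual_info_score

    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    # An NMI of 1.0 means the clustering equals the generating labels
    # up to a relabelling of the clusters.
    print(normalized_mutual_info_score(clustering.labels_, moon_labels))
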
--- sklearn/cluster/tests/test_hierarchical.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 7cacc6418acae..25f196996fcd7 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -34,6 +34,7 @@ from sklearn.utils.fast_dict import IntFloatDict from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_warns +from sklearn.datasets import make_moons, make_circles def test_linkage_misc(): @@ -233,6 +234,21 @@ def test_ward_agglomeration(): # Check that fitting with no samples raises a ValueError assert_raises(ValueError, agglo.fit, X[:0]) +def test_single_linkage_clustering(): + # Check that we get the correct result in two emblematic cases + moons, moon_labels = make_moons(noise=0.05, random_state=42) + clustering = AgglomerativeClustering(n_clusters=2, linkage='single') + clustering.fit(moons) + assert_almost_equal(normalized_mutual_info_score(clustering.labels_, + moon_labels), 1) + + circles, circle_labels = make_circles(factor=0.5, noise=0.025, + random_state=42) + clustering = AgglomerativeClustering(n_clusters=2, linkage='single') + clustering.fit(circles) + assert_almost_equal(normalized_mutual_info_score(clustering.labels_, + circle_labels), 1) + def assess_same_labelling(cut1, cut2): """Util for comparison with scipy""" From c83c896fb20d7f58c3f15c508b6ffe0a0d65c7ca Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 17:15:53 -0500 Subject: [PATCH 21/53] Update docs with a more complete comparison on linkage methods (scale to be determined?) --- doc/modules/clustering.rst | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 1a23b9f3de890..df915ffe7c36c 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -562,26 +562,18 @@ Different linkage type: Ward, complete, average and single linkage :class:`AgglomerativeClustering` supports Ward, single, average, and complete linkage strategies. -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_001.png - :target: ../auto_examples/cluster/plot_digits_linkage.html +.. image:: ../auto_examples/cluster/images/sphx_glr_plot_linkage_comparison_001.png + :target: ../auto_examples/cluster/plot_linkage_comparison.html :scale: 43 -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_002.png - :target: ../auto_examples/cluster/plot_digits_linkage.html - :scale: 43 - -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_003.png - :target: ../auto_examples/cluster/plot_digits_linkage.html - :scale: 43 - - Agglomerative cluster has a "rich get richer" behavior that leads to -uneven cluster sizes. In this regard, complete linkage is the worst +uneven cluster sizes. In this regard, single linkage is the worst strategy, and Ward gives the most regular sizes. However, the affinity (or distance used in clustering) cannot be varied with Ward, thus for non Euclidean metrics, average linkage is a good alternative. Single linkage, while not robust to noisy data, can be computed very efficiently and can therefore be useful to provide hierarchical clustering of larger datasets. +Single linkage can also perform well on non-globular data. .. 
topic:: Examples: From e9234be701ac439f86a0ca6ed63a8c5ac4b8b96a Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 17:23:35 -0500 Subject: [PATCH 22/53] List formatting in example linkage comparison. --- examples/cluster/plot_linkage_comparison.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py index 100ea83e1ecce..005c27ab94214 100644 --- a/examples/cluster/plot_linkage_comparison.py +++ b/examples/cluster/plot_linkage_comparison.py @@ -8,13 +8,14 @@ "interesting" but still in 2D. The main observations to make are: -* single linkage is fast, and can perform well on -non-globular data, but it performs poorly in the -presence of noise. -* average and complete linkage perform well on -cleanly separated globular clusters, but have mixed -results otherwise. -* Ward is the most effective method for noisy data. + +- single linkage is fast, and can perform well on + non-globular data, but it performs poorly in the + presence of noise. +- average and complete linkage perform well on + cleanly separated globular clusters, but have mixed + results otherwise. +- Ward is the most effective method for noisy data. While these examples give some intuition about the algorithms, this intuition might not apply to very high From 3e1017ed2c77350dbbbcd3f7c3773fcb5e2b0a27 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 22:01:06 -0500 Subject: [PATCH 23/53] Flake8 fixes. --- sklearn/cluster/hierarchical.py | 8 +++++--- sklearn/cluster/tests/test_hierarchical.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 67151f14fe794..9a63dc26af341 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -377,7 +377,7 @@ def linkage_tree(X, connectivity=None, n_components=None, linkage_choices = {'complete': _hierarchical.max_merge, 'average': _hierarchical.average_merge, - 'single' : None} # Single linkage is handled differently + 'single': None} # Single linkage is handled differently try: join_func = linkage_choices[linkage] except KeyError: @@ -459,7 +459,7 @@ def linkage_tree(X, connectivity=None, n_components=None, mst_array = np.vstack(nonzeros + (nonzero_vals,)).T # Sort edges of the min_spanning_tree by weight - mst_array = mst_array[np.argsort(mst_array.T[2]),:] + mst_array = mst_array[np.argsort(mst_array.T[2]), :] # Convert edge list into standard hierarchical clustering format single_linkage_tree = _hierarchical.single_linkage_label(mst_array) @@ -562,6 +562,7 @@ def _average_linkage(*args, **kwargs): kwargs['linkage'] = 'average' return linkage_tree(*args, **kwargs) + def _single_linkage(*args, **kwargs): kwargs['linkage'] = 'single' return linkage_tree(*args, **kwargs) @@ -667,7 +668,8 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): when varying the number of clusters and using caching, it may be advantageous to compute the full tree. - linkage : {"ward", "complete", "average", "single"}, optional, default: "ward" + linkage : {"ward", "complete", "average", "single"}, optional \ + (default="ward") Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. 
diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 25f196996fcd7..d524fa78d11da 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -234,6 +234,7 @@ def test_ward_agglomeration(): # Check that fitting with no samples raises a ValueError assert_raises(ValueError, agglo.fit, X[:0]) + def test_single_linkage_clustering(): # Check that we get the correct result in two emblematic cases moons, moon_labels = make_moons(noise=0.05, random_state=42) From 9ec7534a612f925bb79f2833dd1b8784b17eb91c Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 22:25:38 -0500 Subject: [PATCH 24/53] Flake8 fixes. --- sklearn/cluster/hierarchical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 9a63dc26af341..2571f43e48451 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -842,7 +842,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): when varying the number of clusters and using caching, it may be advantageous to compute the full tree. - linkage : {"ward", "complete", "average", "single"}, optional, default "ward" + linkage : {"ward", "complete", "average", "single"}, optional\ + (default="ward") Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. From f5b9077ec2f383127607e6f36d50a96e774e1adb Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sat, 15 Jul 2017 22:52:08 -0500 Subject: [PATCH 25/53] More Flake8 fixes. --- examples/cluster/plot_linkage_comparison.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py index 005c27ab94214..c5637050d39cf 100644 --- a/examples/cluster/plot_linkage_comparison.py +++ b/examples/cluster/plot_linkage_comparison.py @@ -29,8 +29,7 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn import cluster, datasets, mixture -from sklearn.neighbors import kneighbors_graph +from sklearn import cluster, datasets from sklearn.preprocessing import StandardScaler from itertools import cycle, islice From 345ddd75d3f44bb32bbe43fef2c16230b14e5df8 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sun, 16 Jul 2017 08:34:24 -0500 Subject: [PATCH 26/53] Fix agglomerative plot example with correct subplot spec --- examples/cluster/plot_agglomerative_clustering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py index a8a49283a8474..16f5902979188 100644 --- a/examples/cluster/plot_agglomerative_clustering.py +++ b/examples/cluster/plot_agglomerative_clustering.py @@ -56,7 +56,7 @@ 'complete', 'ward', 'single')): - plt.subplot(1, 3, index + 1) + plt.subplot(1, 4, index + 1) model = AgglomerativeClustering(linkage=linkage, connectivity=connectivity, n_clusters=n_clusters) From d0f709b52c6b8cd8b0df2e3fda7c643e15efb6f8 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Sun, 16 Jul 2017 11:27:13 -0500 Subject: [PATCH 27/53] Explicitly test linkages (including single) produce results identical to scipy.cluster.hierarchical --- sklearn/cluster/tests/test_hierarchical.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
index d524fa78d11da..227171e1dc0a6 100644
--- a/sklearn/cluster/tests/test_hierarchical.py
+++ b/sklearn/cluster/tests/test_hierarchical.py
@@ -282,6 +282,12 @@ def test_scikit_vs_scipy():
             children_ = out[:, :2].astype(np.int)
             children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)
 
+            # Sort the order of child nodes per row to match scipy
+            children.sort(axis=1)
+            assert_array_equal(children, children_, 'linkage tree differs'
+                                                    ' from scipy impl for'
+                                                    ' linkage: ' + linkage)
+
             cut = _hc_cut(k, children, n_leaves)
             cut_ = _hc_cut(k, children_, n_leaves)
             assess_same_labelling(cut, cut_)

From 3eed3244ead8343345959130b9ed5eba21e27782 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Sun, 16 Jul 2017 12:00:47 -0500
Subject: [PATCH 28/53] Fix comment on why we sort (consistency)

---
 sklearn/cluster/tests/test_hierarchical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
index 227171e1dc0a6..571986169f130 100644
--- a/sklearn/cluster/tests/test_hierarchical.py
+++ b/sklearn/cluster/tests/test_hierarchical.py
@@ -282,7 +282,7 @@ def test_scikit_vs_scipy():
             children_ = out[:, :2].astype(np.int)
             children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)
 
-            # Sort the order of child nodes per row to match scipy
+            # Sort the order of child nodes per row for consistency
             children.sort(axis=1)

From 55f4d7267bea6b27556469b20690461c1a9f6e17 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Wed, 22 Nov 2017 19:45:57 -0500
Subject: [PATCH 29/53] Fix indentation issue on line 799

---
 sklearn/cluster/hierarchical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py
index 9d3c35e53de48..84336acacd844 100644
--- a/sklearn/cluster/hierarchical.py
+++ b/sklearn/cluster/hierarchical.py
@@ -796,7 +796,7 @@ def fit(self, X, y=None):
         # Cut the tree
         # if compute_full_tree or self.linkage == 'single':
         if compute_full_tree:
-            self.labels_ = _hc_cut(self.n_clusters, self.children_,
+            self.labels_ = _hc_cut(self.n_clusters, self.children_,
                                    self.n_leaves_)
         else:
             labels = _hierarchical.hc_get_heads(parents, copy=False)

From d6d6e65f9473f617c11f1bd8e28da55310f9e497 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Wed, 22 Nov 2017 19:52:46 -0500
Subject: [PATCH 30/53] Docstring for single_linkage_label

---
 sklearn/cluster/_hierarchical.pyx | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx
index 3ee67ed3b00e9..75dda39416f62 100644
--- a/sklearn/cluster/_hierarchical.pyx
+++ b/sklearn/cluster/_hierarchical.pyx
@@ -380,6 +380,24 @@ cdef class UnionFind(object):
 @cython.nonecheck(False)
 cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label(
         np.ndarray[DTYPE_t, ndim=2] L):
+    """
+    Convert a linkage array or MST to a tree by labelling clusters at merges.
+    This is done by using a Union find structure to keep track of merges
+    efficiently.
+
+    Parameters
+    ----------
+
+    L: array of shape (n_samples - 1, 3)
+        The linkage array or MST where each row specifies two samples
+        to be merged and a distance or weight at which the merge occurs. This
+        array is assumed to be sorted by the distance/weight. 
+ + Returns + ------- + + A tree in the format used by scipy.cluster.hierarchy. + """ cdef np.ndarray[DTYPE_t, ndim=2] result_arr cdef DTYPE_t[:, ::1] result From a0613ebe0bee7b3f1010ff240d3b88d5057db6f1 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Mon, 27 Nov 2017 21:09:30 -0500 Subject: [PATCH 31/53] Various fixes for jnothman's detailed comments. --- sklearn/cluster/_hierarchical.pyx | 41 +++++++++--------- sklearn/cluster/hierarchical.py | 71 +++++++++++++++++-------------- 2 files changed, 59 insertions(+), 53 deletions(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index 75dda39416f62..5d15fa5511838 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -338,18 +338,16 @@ cdef class WeightedEdge: cdef class UnionFind(object): - cdef np.ndarray parent_arr cdef np.ndarray size_arr cdef ITYPE_t next_label - cdef ITYPE_t *parent + cdef ITYPE_t[:] parent cdef ITYPE_t *size def __init__(self, N): - self.parent_arr = -1 * np.ones(2 * N - 1, dtype=ITYPE, order='C') + self.parent = -1 * np.ones(2 * N - 1, dtype=ITYPE, order='C') self.next_label = N self.size_arr = np.hstack((np.ones(N, dtype=ITYPE), np.zeros(N-1, dtype=ITYPE))) - self.parent = ( self.parent_arr.data) self.size = ( self.size_arr.data) @cython.boundscheck(False) @@ -368,11 +366,12 @@ cdef class UnionFind(object): cdef ITYPE_t fast_find(self, ITYPE_t n): cdef ITYPE_t p p = n - while self.parent_arr[n] != -1: - n = self.parent_arr[n] - # label up to the root - while self.parent_arr[p] != n: - p, self.parent_arr[p] = self.parent_arr[p], n + # find the highest node in the linkage graph so far + while self.parent[n] != -1: + n = self.parent[n] + # provide a shortcut up to the highest node + while self.parent[p] != n: + p, self.parent[p] = self.parent[p], n return n @@ -387,7 +386,6 @@ cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( Parameters ---------- - L: array of shape (n_samples - 1, 3) The linkage array or MST where each row specifies two samples to be merged and a distance or weight at which the merge occurs. This @@ -395,35 +393,34 @@ cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( Returns ------- - A tree in the format used by scipy.cluster.hierarchy. 
""" cdef np.ndarray[DTYPE_t, ndim=2] result_arr cdef DTYPE_t[:, ::1] result - cdef ITYPE_t N, a, aa, b, bb, index + cdef ITYPE_t left, left_cluster, right, right_cluster, index cdef DTYPE_t delta - result_arr = np.zeros((L.shape[0], L.shape[1] + 1), dtype=DTYPE) + result_arr = np.zeros((L.shape[0], 4), dtype=DTYPE) result = ( ( result_arr.data)) - N = L.shape[0] + 1 - U = UnionFind(N) + U = UnionFind(L.shape[0] + 1) for index in range(L.shape[0]): - a = L[index, 0] - b = L[index, 1] + left = L[index, 0] + right = L[index, 1] delta = L[index, 2] - aa, bb = U.fast_find(a), U.fast_find(b) + left_cluster = U.fast_find(left) + right_cluster = U.fast_find(right) - result[index][0] = aa - result[index][1] = bb + result[index][0] = left_cluster + result[index][1] = right_cluster result[index][2] = delta - result[index][3] = U.size[aa] + U.size[bb] + result[index][3] = U.size[left_cluster] + U.size[right_cluster] - U.union(aa, bb) + U.union(left_cluster, right_cluster) return result_arr diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 84336acacd844..734e49dc47f1d 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -80,6 +80,44 @@ def _fix_connectivity(X, connectivity, affinity): return connectivity, n_components +def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, + n_components, return_distance): + """ + Perform single linkage clustering on sparse data via the minimum + spanning tree from scipy.sparse.csgraph, then using union-find to label. + The parent array is then generated by walking through the tree. + """ + from scipy.sparse.csgraph import minimum_spanning_tree + + # Use scipy.sparse.csgraph to generate a minimum spanning tree + mst = minimum_spanning_tree(connectivity.tocsr()) + + # Convert the graph to scipy.cluster.hierarchy array format + mst = mst.tocoo() + mst_array = np.vstack([mst.row, mst.col, mst.data]).T + + # Sort edges of the min_spanning_tree by weight + mst_array = mst_array[np.argsort(mst_array.T[2]), :] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = _hierarchical.single_linkage_label(mst_array) + children_ = single_linkage_tree[:, :2].astype(np.int) + + # Compute parents + parent = np.arange(n_nodes, dtype=np.intp) + for i, (left, right) in enumerate(children_, n_samples): + if n_clusters is not None and i >= n_nodes: + break + if left < n_nodes: + parent[left] = i + if right < n_nodes: + parent[right] = i + + if return_distance: + distances = single_linkage_tree[:, 2] + return children_, n_components, n_samples, parent, distances + return children_, n_components, n_samples, parent + ############################################################################### # Hierarchical tree building functions @@ -453,37 +491,8 @@ def linkage_tree(X, connectivity=None, n_components='deprecated', n_nodes = 2 * n_samples - n_clusters if linkage == 'single': - from scipy.sparse.csgraph import minimum_spanning_tree - - # Use scipy.sparse.csgraph to generate a minimum spanning tree - mst = minimum_spanning_tree(connectivity.tocsr()) - - # Convert the graph to scipy cluster array format - nonzeros = mst.nonzero() - nonzero_vals = mst.data - mst_array = np.vstack(nonzeros + (nonzero_vals,)).T - - # Sort edges of the min_spanning_tree by weight - mst_array = mst_array[np.argsort(mst_array.T[2]), :] - - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = _hierarchical.single_linkage_label(mst_array) - children_ = 
single_linkage_tree[:, :2].astype(np.int) - - # Compute parents - parent = np.arange(n_nodes, dtype=np.intp) - for i, (left, right) in enumerate(children_, n_samples): - if n_clusters is not None and i >= n_nodes: - break - if left < n_nodes: - parent[left] = i - if right < n_nodes: - parent[right] = i - - if return_distance: - distances = single_linkage_tree[:, 2] - return children_, n_components, n_samples, parent, distances - return children_, n_components, n_samples, parent + return _single_linkage_tree(connectivity, n_samples, n_nodes, + n_clusters, n_components, return_distance) if return_distance: distances = np.empty(n_nodes - n_samples) From 6f8af804d2b80b498eb3abfefb30fe687a1f4809 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Mon, 27 Nov 2017 21:21:28 -0500 Subject: [PATCH 32/53] Further corrections in cython (memoryviews all around in UnionFind) --- sklearn/cluster/_hierarchical.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index 5d15fa5511838..216dc059d68f2 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -338,17 +338,15 @@ cdef class WeightedEdge: cdef class UnionFind(object): - cdef np.ndarray size_arr cdef ITYPE_t next_label cdef ITYPE_t[:] parent - cdef ITYPE_t *size + cdef ITYPE_t[:] size def __init__(self, N): self.parent = -1 * np.ones(2 * N - 1, dtype=ITYPE, order='C') self.next_label = N - self.size_arr = np.hstack((np.ones(N, dtype=ITYPE), - np.zeros(N-1, dtype=ITYPE))) - self.size = (<ITYPE_t*> self.size_arr.data) + self.size = np.hstack((np.ones(N, dtype=ITYPE), + np.zeros(N - 1, dtype=ITYPE))) @cython.boundscheck(False) @cython.nonecheck(False) From 627eed35d9185454f9c563bcadcb28c65a60a10b Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Mon, 27 Nov 2017 21:34:52 -0500 Subject: [PATCH 33/53] Update WhatsNew for single linkage clustering. --- doc/whats_new/v0.20.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index e9c2254bc2220..00fc00076986e 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -64,6 +64,12 @@ Model evaluation ``'balanced_accuracy'`` scorer for binary classification. :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`. +Clustering + +- :class:`cluster.AgglomerativeClustering` now supports Single Linkage + clustering via ``linkage='single'``. :issue:`9372` by + :user:`Leland McInnes <lmcinnes>` and :user:`Steve Astels <sastels>`. + Enhancements ............ From b737aac5c7e001e4bc809eccd9608e11ee71c526 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Tue, 16 Jan 2018 13:22:53 -0500 Subject: [PATCH 34/53] Address Jake's concerns.
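For reviewers skimming the diff: the semantics of the cleaned-up UnionFind amount to the pure-Python sketch below (illustrative only; ``UnionFindSketch`` is a hypothetical name, and the find loop is written in an equivalent but more defensive form than the Cython original):

    import numpy as np

    class UnionFindSketch(object):
        def __init__(self, n):
            # n leaf slots plus n - 1 slots for the internal merge nodes
            self.parent = -1 * np.ones(2 * n - 1, dtype=np.intp)
            self.size = np.hstack((np.ones(n, dtype=np.intp),
                                   np.zeros(n - 1, dtype=np.intp)))
            self.next_label = n

        def union(self, m, n):
            # link both roots to a fresh label; record the merged size once
            self.parent[m] = self.next_label
            self.parent[n] = self.next_label
            self.size[self.next_label] = self.size[m] + self.size[n]
            self.next_label += 1

        def fast_find(self, n):
            root = n
            while self.parent[root] != -1:   # climb to the current root
                root = self.parent[root]
            while self.parent[n] != -1:      # compress the path behind us
                self.parent[n], n = root, self.parent[n]
            return root

Minting a fresh label on every union is what lets the labelling routine emit rows in the scipy.cluster.hierarchy format directly.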
--- sklearn/cluster/_hierarchical.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index bbb7d2684d237..857f3ddc0d07a 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -351,7 +351,6 @@ cdef class UnionFind(object): @cython.boundscheck(False) @cython.nonecheck(False) cdef void union(self, ITYPE_t m, ITYPE_t n): - self.size[self.next_label] = self.size[m] + self.size[n] self.parent[m] = self.next_label self.parent[n] = self.next_label self.size[self.next_label] = self.size[m] + self.size[n] @@ -359,7 +358,6 @@ cdef class UnionFind(object): return - @cython.boundscheck(False) @cython.nonecheck(False) cdef ITYPE_t fast_find(self, ITYPE_t n): cdef ITYPE_t p @@ -372,7 +370,6 @@ cdef class UnionFind(object): p, self.parent[p] = self.parent[p], n return n - @cython.boundscheck(False) @cython.nonecheck(False) cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( From 3a8d505edb1a93d0ceb50c31e7ca4c95a8194aed Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Tue, 16 Jan 2018 14:39:17 -0500 Subject: [PATCH 35/53] Handle true zero distances by setting them to "epsilon" distances --- sklearn/cluster/hierarchical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index bf140b0bd19c2..b59c522d9ef40 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -89,6 +89,10 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, """ from scipy.sparse.csgraph import minimum_spanning_tree + # Ensure zero distances aren't ignored by setting them to "epsilon" + epsilon_value = connectivity.data[connectivity > 0].min() * 1E-8 + connectivity.data[connectivity == 0] = epsilon_value + # Use scipy.sparse.csgraph to generate a minimum spanning tree mst = minimum_spanning_tree(connectivity.tocsr()) From cb3544981601f104f611c6f1fdf536e52fe244df Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Tue, 16 Jan 2018 14:42:54 -0500 Subject: [PATCH 36/53] Missed the memory view direct assignment fix. --- sklearn/cluster/_hierarchical.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index 857f3ddc0d07a..6abdf5471ebfa 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -398,8 +398,7 @@ cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( cdef DTYPE_t delta result_arr = np.zeros((L.shape[0], 4), dtype=DTYPE) - result = (<DTYPE_t[:L.shape[0], :4:1]> (<DTYPE_t*> - result_arr.data)) + result = result_arr U = UnionFind(L.shape[0] + 1) From b9c23e1329217de75e96a722a51e654f58905586 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Tue, 16 Jan 2018 15:39:57 -0500 Subject: [PATCH 37/53] Missed .data in array fancy indexing for epsilon in place of zero values.
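The distinction matters because ``connectivity > 0`` builds a sparse boolean matrix, whereas the epsilon substitution has to operate on the flat ``.data`` array of explicitly stored entries. A minimal standalone sketch of the corrected logic (the matrix values here are made up):

    import numpy as np
    from scipy.sparse import csr_matrix

    # Three points, two of them identical, so one stored distance is 0.0
    data = np.array([0.0, 1.0, 0.0, 1.0, 1.0, 1.0])
    indices = np.array([1, 2, 0, 2, 0, 1])
    indptr = np.array([0, 2, 4, 6])
    connectivity = csr_matrix((data, indices, indptr), shape=(3, 3))

    # Mask the stored values themselves, not the matrix
    epsilon_value = connectivity.data[connectivity.data > 0].min() * 1E-8
    connectivity.data[connectivity.data == 0] = epsilon_value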
--- sklearn/cluster/hierarchical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index b59c522d9ef40..934023ffa205e 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -90,8 +90,8 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, from scipy.sparse.csgraph import minimum_spanning_tree # Ensure zero distances aren't ignored by setting them to "epsilon" - epsilon_value = connectivity.data[connectivity > 0].min() * 1E-8 - connectivity.data[connectivity == 0] = epsilon_value + epsilon_value = connectivity.data[connectivity.data > 0].min() * 1E-8 + connectivity.data[connectivity.data == 0] = epsilon_value # Use scipy.sparse.csgraph to generate a minimum spanning tree mst = minimum_spanning_tree(connectivity.tocsr()) From 276d265462b2ecc33887809a62ad0462cda7318f Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Tue, 16 Jan 2018 20:29:13 -0500 Subject: [PATCH 38/53] Add test for identical points messing with sparse linkage clustering. --- sklearn/cluster/tests/test_hierarchical.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 128e99e21ac9b..d60f284087948 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -24,7 +24,7 @@ from sklearn.cluster import ward_tree from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration from sklearn.cluster.hierarchical import (_hc_cut, _TREE_BUILDERS, - linkage_tree) + linkage_tree, _fix_connectivity) from sklearn.feature_extraction.image import grid_to_graph from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\ manhattan_distances, pairwise_distances @@ -309,6 +309,23 @@ def test_scikit_vs_scipy(): # Test error management in _hc_cut assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves) +def test_identical_points(): + # Ensure identical points are handled correctly when using mst with + # a sparse connectivity matrix + X = np.array([[0,0,0][0,0,0],[1,1,1],[1,1,1],[2,2,2],[2,2,2]]) + true_labels = np.array([0,0,1,1,2,2]) + connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) + connectivity = 0.5 * (connectivity + connectivity.T) + connectivity, n_components = _fix_connectivity(X, connectivity) + + clustering = AgglomerativeClustering(n_clusters=3, + linkage='single', + connectivity=connectivity) + clustering.fit(X) + + assert_almost_equal(normalized_mutual_info_score(clustering.labels_, + true_labels), 1) + def test_connectivity_propagation(): # Check that connectivity in the ward tree is propagated correctly during From d33db416c132c887c5e58b22a0a0ce11c8956c40 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Tue, 16 Jan 2018 21:04:49 -0500 Subject: [PATCH 39/53] Missing comma in test data declaration --- sklearn/cluster/tests/test_hierarchical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index d60f284087948..f67a03355f2f9 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -312,7 +312,7 @@ def test_scikit_vs_scipy(): def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix - X = 
np.array([[0,0,0][0,0,0],[1,1,1],[1,1,1],[2,2,2],[2,2,2]]) + X = np.array([[0,0,0],[0,0,0],[1,1,1],[1,1,1],[2,2,2],[2,2,2]]) true_labels = np.array([0,0,1,1,2,2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) From 7bbaf7f2a92c902ef646437d747e61d71ae8f1ff Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 17:01:38 -0500 Subject: [PATCH 40/53] Correct arguments to _fix_connectivity --- sklearn/cluster/tests/test_hierarchical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index f67a03355f2f9..4f8c2779e2915 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -316,7 +316,7 @@ def test_identical_points(): true_labels = np.array([0,0,1,1,2,2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) - connectivity, n_components = _fix_connectivity(X, connectivity) + connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean') clustering = AgglomerativeClustering(n_clusters=3, linkage='single', From 1ec7beb3201cd999ad1b146430a91c684bb7cabb Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 18:14:04 -0500 Subject: [PATCH 41/53] Flake8 fixes for new test. --- sklearn/cluster/tests/test_hierarchical.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 4f8c2779e2915..a2bdf6ca93636 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -312,11 +312,15 @@ def test_scikit_vs_scipy(): def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix - X = np.array([[0,0,0],[0,0,0],[1,1,1],[1,1,1],[2,2,2],[2,2,2]]) - true_labels = np.array([0,0,1,1,2,2]) + X = np.array([[0, 0, 0],[0, 0, 0], + [1, 1, 1],[1, 1, 1], + [2, 2, 2],[2, 2, 2]]) + true_labels = np.array([0, 0, 1, 1, 2, 2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) - connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean') + connectivity, n_components = _fix_connectivity(X, + connectivity, + 'euclidean') clustering = AgglomerativeClustering(n_clusters=3, linkage='single', From cbd9b805ca8f6c891bdbe624d2671d7962e2c2f1 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 18:51:00 -0500 Subject: [PATCH 42/53] More flake8 fixes for new test. 
--- sklearn/cluster/tests/test_hierarchical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index a2bdf6ca93636..5c182d3a0b0f4 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -312,9 +312,9 @@ def test_scikit_vs_scipy(): def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix - X = np.array([[0, 0, 0],[0, 0, 0], - [1, 1, 1],[1, 1, 1], - [2, 2, 2],[2, 2, 2]]) + X = np.array([[0, 0, 0], [0, 0, 0], + [1, 1, 1], [1, 1, 1], + [2, 2, 2], [2, 2, 2]]) true_labels = np.array([0, 0, 1, 1, 2, 2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) From 219b2e5b8cd0ae05093791e14a0675aa60dfe79f Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 19:09:57 -0500 Subject: [PATCH 43/53] More flake8 fixes for new test. --- sklearn/cluster/tests/test_hierarchical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 5c182d3a0b0f4..e7170a8380b99 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -309,6 +309,7 @@ def test_scikit_vs_scipy(): # Test error management in _hc_cut assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves) + def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix From 239e8f8dab9170cff15a6e555dc40bd4bd6b3522 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 22:34:18 -0500 Subject: [PATCH 44/53] Test all the linkage methods for identical point issues --- sklearn/cluster/tests/test_hierarchical.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index e7170a8380b99..3dcc415424cb9 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -323,13 +323,14 @@ def test_identical_points(): connectivity, 'euclidean') - clustering = AgglomerativeClustering(n_clusters=3, - linkage='single', - connectivity=connectivity) - clustering.fit(X) + for linkage in ('single', 'complete', 'average', 'ward'): + clustering = AgglomerativeClustering(n_clusters=3, + linkage=linkage, + connectivity=connectivity) + clustering.fit(X) - assert_almost_equal(normalized_mutual_info_score(clustering.labels_, - true_labels), 1) + assert_almost_equal(normalized_mutual_info_score(clustering.labels_, + true_labels), 1) From 5e4c22db295fb0fbb4d5b150480bcbc4a608d73e Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 22:38:30 -0500 Subject: [PATCH 45/53] Remove comment; fix epsilon values --- sklearn/cluster/hierarchical.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 934023ffa205e..52ac6155ffc72 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -90,7 +90,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, from scipy.sparse.csgraph import minimum_spanning_tree # Ensure zero distances aren't ignored by setting them to "epsilon" - epsilon_value = 
connectivity.data[connectivity.data > 0].min() * 1E-8 + epsilon_value = np.nextafter(0, 1, dtype=connectivity.data.dtype) connectivity.data[connectivity.data == 0] = epsilon_value # Use scipy.sparse.csgraph to generate a minimum spanning tree @@ -98,6 +98,10 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, # Convert the graph to scipy.cluster.hierarchy array format mst = mst.tocoo() + + # Undo the epsilon values + mst.data[mst.data == epsilon_value] = 0 + mst_array = np.vstack([mst.row, mst.col, mst.data]).T # Sort edges of the min_spanning_tree by weight @@ -806,7 +810,6 @@ def fit(self, X, y=None): n_clusters=n_clusters, **kwargs) # Cut the tree - # if compute_full_tree or self.linkage == 'single': if compute_full_tree: self.labels_ = _hc_cut(self.n_clusters, self.children_, self.n_leaves_) From 7aae4112583da559b4e6d2cb080fd04f2671ae86 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 22:49:55 -0500 Subject: [PATCH 46/53] Cast precomputed distances to float64 for consistency --- sklearn/cluster/hierarchical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 52ac6155ffc72..75344efdce4a6 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -483,7 +483,7 @@ def linkage_tree(X, connectivity=None, n_components='deprecated', del diag_mask if affinity == 'precomputed': - distances = X[connectivity.row, connectivity.col] + distances = X[connectivity.row, connectivity.col].astype('float64') else: # FIXME We compute all the distances, while we could have only computed # the "interesting" distances From 3d42400195660f3da304e8c10ea90082dc7a5ba2 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 22:55:08 -0500 Subject: [PATCH 47/53] Turn bounds checking off; add docstring warning. --- sklearn/cluster/_hierarchical.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index 6abdf5471ebfa..f7774d6b9f9bb 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -358,6 +358,7 @@ cdef class UnionFind(object): return + @cython.boundscheck(False) @cython.nonecheck(False) cdef ITYPE_t fast_find(self, ITYPE_t n): cdef ITYPE_t p @@ -386,6 +387,9 @@ cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( to be merged and a distance or weight at which the merge occurs. This array is assumed to be sorted by the distance/weight. + Invalid arrays will potentially cause segfaults. Please validate the + content of arrays prior to passing them to this function. + Returns ------- A tree in the format used by scipy.cluster.hierarchy.
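An aside on the epsilon handling settled above: ``minimum_spanning_tree`` effectively drops zero-weight edges (the reason the comment warns they would otherwise be ignored), so true zero distances are flagged with the smallest positive double and restored once the tree is built. A standalone sketch of the idea (the weights are made up):

    import numpy as np

    eps = np.nextafter(0, 1)        # smallest positive double, ~4.9e-324
    weights = np.array([0.0, 0.5, 0.0, 2.0])
    weights[weights == 0] = eps     # keep zero-weight edges visible
    # ... build the minimum spanning tree on these weights ...
    weights[weights == eps] = 0     # undo the epsilon values exactly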
From df6b9ce9e6c242640cc318263c66d9f471ac4255 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 17 Jan 2018 22:57:28 -0500 Subject: [PATCH 48/53] Function spacing formatting issue --- sklearn/cluster/hierarchical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 75344efdce4a6..42886e5aecd0d 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -126,6 +126,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, return children_, n_components, n_samples, parent, distances return children_, n_components, n_samples, parent + ############################################################################### # Hierarchical tree building functions From 8e0b38c2070f22eb7e1bbca5d656d82c45640e9a Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Thu, 18 Jan 2018 11:05:39 -0500 Subject: [PATCH 49/53] Make public and private versions of labelling. --- sklearn/cluster/_hierarchical.pyx | 36 ++++++++++++++++++++++++++----- sklearn/cluster/hierarchical.py | 2 +- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index f7774d6b9f9bb..9e7a355f00982 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -373,12 +373,14 @@ cdef class UnionFind(object): @cython.boundscheck(False) @cython.nonecheck(False) -cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( +cpdef np.ndarray[DTYPE_t, ndim=2] _single_linkage_label( np.ndarray[DTYPE_t, ndim=2] L): """ Convert a linkage array or MST to a tree by labelling clusters at merges. This is done by using a Union find structure to keep track of merges - efficiently. + efficiently. This is the private version of the function that assumes that + ``L`` has been properly validated. See ``single_linkage_label`` for the + user-facing version of this function. Parameters ---------- @@ -387,9 +389,6 @@ cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( to be merged and a distance or weight at which the merge occurs. This array is assumed to be sorted by the distance/weight. - Invalid arrays will potentially cause segfaults. Please validate the - content of arrays prior to passing them to this function. - Returns ------- A tree in the format used by scipy.cluster.hierarchy. @@ -422,3 +421,30 @@ cpdef np.ndarray[DTYPE_t, ndim=2] single_linkage_label( U.union(left_cluster, right_cluster) return result_arr + + +def single_linkage_label(L): + """ + Convert a linkage array or MST to a tree by labelling clusters at merges. + This is done by using a Union find structure to keep track of merges + efficiently. + + Parameters + ---------- + L: array of shape (n_samples - 1, 3) + The linkage array or MST where each row specifies two samples + to be merged and a distance or weight at which the merge occurs. This + array is assumed to be sorted by the distance/weight. + + Returns + ------- + A tree in the format used by scipy.cluster.hierarchy.
+ """ + # Validate L + if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1: + raise ValueError("Input MST array is not a validly formatted MST array") + + if not np.all(np.sort(L[:, 2]) == L[:, 2]): + raise ValueError("Input MST array must be sorted by weight") + + return _single_linkage_label(L) \ No newline at end of file diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 42886e5aecd0d..383d2cb88a8a7 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -108,7 +108,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, mst_array = mst_array[np.argsort(mst_array.T[2]), :] # Convert edge list into standard hierarchical clustering format - single_linkage_tree = _hierarchical.single_linkage_label(mst_array) + single_linkage_tree = _hierarchical._single_linkage_label(mst_array) children_ = single_linkage_tree[:, :2].astype(np.int) # Compute parents From 5abc614026337a3918c045cb090bff9ba1d8265e Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Thu, 18 Jan 2018 11:42:37 -0500 Subject: [PATCH 50/53] more efficient is sorted check --- sklearn/cluster/_hierarchical.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hierarchical.pyx b/sklearn/cluster/_hierarchical.pyx index 9e7a355f00982..5d42d84944956 100644 --- a/sklearn/cluster/_hierarchical.pyx +++ b/sklearn/cluster/_hierarchical.pyx @@ -444,7 +444,8 @@ def single_linkage_label(L): if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1: raise ValueError("Input MST array is not a validly formatted MST array") - if not np.all(np.sort(L[:, 2]) == L[:, 2]): + is_sorted = lambda x: np.all(x[:-1] <= x[1:]) + if not is_sorted(L[:, 2]): raise ValueError("Input MST array must be sorted by weight") return _single_linkage_label(L) \ No newline at end of file From 3f73d9845bf9e7b2fb6dc52e0ae188d5fbeeed88 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Thu, 18 Jan 2018 11:59:26 -0500 Subject: [PATCH 51/53] Explicit cast to cover all bases --- sklearn/cluster/hierarchical.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 383d2cb88a8a7..f424f5102019d 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -89,6 +89,9 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, """ from scipy.sparse.csgraph import minimum_spanning_tree + # explicitly cast connectivity to ensure safety + connectivity = connectivity.astype('float64') + # Ensure zero distances aren't ignored by setting them to "epsilon" epsilon_value = np.nextafter(0, 1, dtype=connectivity.data.dtype) connectivity.data[connectivity.data == 0] = epsilon_value From 9bb23557e6a31f5e585b8452d715741e90a29e44 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Fri, 19 Jan 2018 20:16:47 -0500 Subject: [PATCH 52/53] Address various issue in documentation and examples. --- doc/modules/clustering.rst | 7 +++--- .../cluster/plot_agglomerative_clustering.py | 25 ++++++++++--------- examples/cluster/plot_digits_linkage.py | 4 +-- examples/cluster/plot_linkage_comparison.py | 6 ++--- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index bca8ebcf1c48c..9e74f1f2e4e6e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -567,7 +567,7 @@ considers at each step all the possible merges. number of features. 
It is a dimensionality reduction tool, see :ref:`data_reduction`. -Different linkage type: Ward, complete, average and single linkage +Different linkage type: Ward, complete, average, and single linkage ------------------------------------------------------------------ :class:`AgglomerativeClustering` supports Ward, single, average, and complete @@ -646,15 +646,16 @@ enable only merging of neighboring pixels on an image, as in the * :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py` -.. warning:: **Connectivity constraints with average and complete linkage** +.. warning:: **Connectivity constraints with single, average and complete linkage** - Connectivity constraints and complete or average linkage can enhance + Connectivity constraints and single, complete or average linkage can enhance the 'rich getting richer' aspect of agglomerative clustering, particularly so if they are built with :func:`sklearn.neighbors.kneighbors_graph`. In the limit of a small number of clusters, they tend to give a few macroscopically occupied clusters and almost empty ones. (see the discussion in :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py`). + Single linkage is the most brittle linkage option with regard to this issue. .. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_001.png :target: ../auto_examples/cluster/plot_agglomerative_clustering.html diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py index 16f5902979188..1a9cf22c1e5f7 100644 --- a/examples/cluster/plot_agglomerative_clustering.py +++ b/examples/cluster/plot_agglomerative_clustering.py @@ -9,17 +9,18 @@ Two consequences of imposing a connectivity can be seen. First clustering with a connectivity matrix is much faster. -Second, when using a connectivity matrix, average and complete linkage are -unstable and tend to create a few clusters that grow very quickly. Indeed, -average and complete linkage fight this percolation behavior by considering all -the distances between two clusters when merging them. The connectivity -graph breaks this mechanism. This effect is more pronounced for very -sparse graphs (try decreasing the number of neighbors in -kneighbors_graph) and with complete linkage. In particular, having a very -small number of neighbors in the graph, imposes a geometry that is -close to that of single linkage, which is well known to have this -percolation instability. -""" +Second, when using a connectivity matrix, single, average and complete +linkage are unstable and tend to create a few clusters that grow very +quickly. Indeed, average and complete linkage fight this percolation behavior +by considering all the distances between two clusters when merging them ( +while single linkage exaggerates the behaviour by considering only the +shortest distance between clusters). The connectivity graph breaks this +mechanism for average and complete linkage, making them resemble the more +brittle single linkage. This effect is more pronounced for very sparse graphs +(try decreasing the number of neighbors in kneighbors_graph) and with +complete linkage. In particular, having a very small number of neighbors in +the graph, imposes a geometry that is close to that of single linkage, +which is well known to have this percolation instability. 
""" # Authors: Gael Varoquaux, Nelle Varoquaux # License: BSD 3 clause @@ -65,7 +66,7 @@ elapsed_time = time.time() - t0 plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.spectral) - plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time), + plt.title('linkage=%s\n(time %.2fs)' % (linkage, elapsed_time), fontdict=dict(verticalalignment='top')) plt.axis('equal') plt.axis('off') diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index d08842d74a374..ba69d04eb4957 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -71,7 +71,7 @@ def plot_clustering(X_red, X, labels, title=None): if title is not None: plt.title(title, size=17) plt.axis('off') - plt.tight_layout() + plt.tight_layout(rect=[0, 0.03, 1, 0.95]) #---------------------------------------------------------------------- # 2D embedding of the digits dataset @@ -85,7 +85,7 @@ def plot_clustering(X_red, X, labels, title=None): clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) t0 = time() clustering.fit(X_red) - print("%s : %.2fs" % (linkage, time() - t0)) + print("%s :\t%.2fs" % (linkage, time() - t0)) plot_clustering(X_red, X, clustering.labels_, "%s linkage" % linkage) diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py index c5637050d39cf..f8c723bc6c607 100644 --- a/examples/cluster/plot_linkage_comparison.py +++ b/examples/cluster/plot_linkage_comparison.py @@ -35,10 +35,10 @@ np.random.seed(0) -# ============ +###################################################################### # Generate datasets. We choose the size big enough to see the scalability # of the algorithms, but not too big to avoid too long running times -# ============ + n_samples = 1500 noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05) @@ -61,7 +61,7 @@ # ============ # Set up cluster parameters # ============ -plt.figure(figsize=(9 * 2 + 3, 12.5)) +plt.figure(figsize=(9 * 2 + 3, 14.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) From 66571ece3cc1b2732f0b1a4dda5e1816e739f2bf Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Mon, 22 Jan 2018 14:07:02 +0100 Subject: [PATCH 53/53] COSMIT: cosmetic changes --- doc/modules/clustering.rst | 2 +- examples/cluster/plot_linkage_comparison.py | 7 ++++--- sklearn/cluster/hierarchical.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 9e74f1f2e4e6e..c992e95ebdaba 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -568,7 +568,7 @@ considers at each step all the possible merges. :ref:`data_reduction`. Different linkage type: Ward, complete, average, and single linkage ------------------------------------------------------------------- +-----------------------------------------------------------------)- :class:`AgglomerativeClustering` supports Ward, single, average, and complete linkage strategies. 
diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py index f8c723bc6c607..471132a0f222f 100644 --- a/examples/cluster/plot_linkage_comparison.py +++ b/examples/cluster/plot_linkage_comparison.py @@ -58,10 +58,11 @@ cluster_std=[1.0, 2.5, 0.5], random_state=random_state) -# ============ +###################################################################### +# Run the clustering and plot + # Set up cluster parameters -# ============ -plt.figure(figsize=(9 * 2 + 3, 14.5)) +plt.figure(figsize=(9 * 1.3 + 2, 14.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index f424f5102019d..c462f2f2cda2e 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -779,7 +779,7 @@ def fit(self, X, y=None): (self.affinity, )) if self.linkage not in _TREE_BUILDERS: - raise ValueError("Unknown linkage type %s." + raise ValueError("Unknown linkage type %s. " "Valid options are %s" % (self.linkage, _TREE_BUILDERS.keys())) tree_builder = _TREE_BUILDERS[self.linkage]
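Taken together, the series enables the following end-to-end usage (a minimal sketch against the merged API; the data values are made up):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    X = np.array([[0.0, 0.0], [0.1, 0.0], [4.0, 4.0], [4.1, 4.0]])
    model = AgglomerativeClustering(n_clusters=2, linkage='single')
    labels = model.fit_predict(X)
    print(labels)   # the two nearby pairs land in two separate clusters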