From c8e8ff26a712b2184349ca09a5c960d1a2f378f3 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 12:22:31 -0400 Subject: [PATCH 01/36] clusterQR method added to spectral segmentation --- examples/cluster/plot_coin_segmentation.py | 26 ++++++---- sklearn/cluster/spectral.py | 55 ++++++++++++++++++---- sklearn/cluster/tests/test_spectral.py | 8 +++- 3 files changed, 69 insertions(+), 20 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index caff0eb6b4b61..fae1deeb90cc6 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -14,12 +14,15 @@ * with 'kmeans' spectral clustering will cluster samples in the embedding space using a kmeans algorithm +* with 'clusterQR' will cluster samples in the embedding space + using a clusterQR algorithm, * whereas 'discrete' will iteratively search for the closest partition space to the embedding space. 
""" print(__doc__) # Author: Gael Varoquaux , Brian Cheung +# Andrew Knyazev added clusterQR # License: BSD 3 clause import time @@ -54,28 +57,31 @@ eps = 1e-6 graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps -# Apply spectral clustering (this step goes much faster if you have pyamg -# installed) -N_REGIONS = 25 +# the actual number of regions in this example is 27: background and 26 coins +N_REGIONS = 26 ############################################################################# -# Visualize the resulting regions +# compute and visualize the resulting regions -for assign_labels in ('kmeans', 'discretize'): +# if often helps the spectral clustering to compute a few extra eigenvectors +N_REGIONS_PLUS = 3 + +for assign_labels in ('kmeans', 'discretize', 'clusterQR'): t0 = time.time() - labels = spectral_clustering(graph, n_clusters=N_REGIONS, + labels = spectral_clustering(graph, n_clusters=N_REGIONS+N_REGIONS_PLUS, assign_labels=assign_labels, random_state=42) t1 = time.time() labels = labels.reshape(rescaled_coins.shape) plt.figure(figsize=(5, 5)) - plt.imshow(rescaled_coins, cmap=plt.cm.gray) - for l in range(N_REGIONS): - plt.contour(labels == l, - colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) + plt.imshow(rescaled_coins, cmap=plt.get_cmap('gray')) plt.xticks(()) plt.yticks(()) title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0)) print(title) plt.title(title) + for l in range(N_REGIONS): + plt.contour(labels == l, + colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) + plt.pause(0.5) plt.show() diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 75757ae907717..4aab59e4a2bef 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -4,11 +4,14 @@ # Author: Gael Varoquaux gael.varoquaux@normalesup.org # Brian Cheung # Wei LI +# Modified by Andrew Knyazev to add clusterQR # License: BSD 3 clause import warnings import numpy as np +from scipy.linalg import qr, svd + from 
..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array from ..utils.validation import check_array @@ -18,6 +21,40 @@ from .k_means_ import k_means +def clusterQR(vectors): + """Search for a partition matrix (clustering) which is + closest to the eigenvector embedding. + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + + References + ---------- + https://github.com/asdamle/QR-spectral-clustering + https://arxiv.org/abs/1708.07481 + + Notes + ----- + T.conj() allows the vectors to be complex-valued, just in case for future use + + """ + + k = vectors.shape[1] + piv = qr(vectors.T.conj(), pivoting=True)[2] + piv = piv[0:k] + Ut, Vt = svd(vectors[piv, :].T.conj())[0],\ + svd(vectors[piv, :].T.conj())[2].T.conj() + vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T.conj()))) + return (vectors.argmax(axis=1)).T + + def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None): """Search for a partition matrix (clustering) which is closest to the @@ -42,7 +79,6 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, Determines random number generation for rotation matrix initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. - Returns ------- labels : array of integers, shape: n_samples @@ -210,10 +246,10 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. - assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. 
But it can + space. There are two ways to assign labels after the laplacian + embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. See the 'Multiclass spectral clustering' paper referenced below for @@ -247,10 +283,11 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering. """ - if assign_labels not in ('kmeans', 'discretize'): - raise ValueError("The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', but '%s' was given" - % assign_labels) + if assign_labels not in ('kmeans', 'discretize', 'clusterQR'): + raise ValueError( + "The 'assign_labels' parameter should be " + "'kmeans', 'discretize', or 'clusterQR' but '%s' was given" + % assign_labels) random_state = check_random_state(random_state) n_components = n_clusters if n_components is None else n_components @@ -266,6 +303,8 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, if assign_labels == 'kmeans': _, labels, _ = k_means(maps, n_clusters, random_state=random_state, n_init=n_init) + elif assign_labels == 'clusterQR': + labels = clusterQR(maps) else: labels = discretize(maps, random_state=random_state) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 0c220e7615e67..2d232ba5e2d94 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -30,7 +30,11 @@ @pytest.mark.parametrize('eigen_solver', ('arpack', 'lobpcg')) -@pytest.mark.parametrize('assign_labels', ('kmeans', 'discretize')) +@pytest.mark.parametrize( + 'assign_labels', + ('kmeans', + 'discretize', + 'clusterQR')) def test_spectral_clustering(eigen_solver, assign_labels): S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], @@ -109,7 +113,7 @@ def 
test_affinities(): # on OSX and Linux X, y = make_blobs(n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 - ) + ) # nearest neighbors affinity sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', random_state=0) From 6f819e7e3de198dca383ae4a0db803249063c278 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 15:10:25 -0400 Subject: [PATCH 02/36] fix comment line too long --- sklearn/cluster/spectral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 4aab59e4a2bef..8a5f67c71f9a4 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -42,7 +42,8 @@ def clusterQR(vectors): Notes ----- - T.conj() allows the vectors to be complex-valued, just in case for future use + T.conj() allows the vectors to be complex-valued, + just in case for future use """ From 99027676e8b26c0cbae02798bcead3acd4099398 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 15:21:16 -0400 Subject: [PATCH 03/36] typo fixed in spectral.py --- sklearn/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 8a5f67c71f9a4..190e1b3725336 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -380,7 +380,7 @@ class SpectralClustering(BaseEstimator, ClusterMixin): Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. - assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding space. There are two ways to assign labels after the laplacian embedding. k-means can be applied and is a popular choice. 
But it can From 9fa645751214d5132f5547134b943c03fb608439 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 16:13:06 -0400 Subject: [PATCH 04/36] spectral.py trailing white space removed --- sklearn/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 190e1b3725336..c17f29ae95176 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -42,7 +42,7 @@ def clusterQR(vectors): Notes ----- - T.conj() allows the vectors to be complex-valued, + T.conj() allows the vectors to be complex-valued, just in case for future use """ From 44daccb4c170fd0bd2f25afd3db432c8fe38b717 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 17:42:58 -0400 Subject: [PATCH 05/36] typos fixed --- examples/cluster/plot_coin_segmentation.py | 11 ++++++----- sklearn/cluster/spectral.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index fae1deeb90cc6..6493684f54fcb 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -15,7 +15,7 @@ * with 'kmeans' spectral clustering will cluster samples in the embedding space using a kmeans algorithm * with 'clusterQR' will cluster samples in the embedding space - using a clusterQR algorithm, + using a clusterQR algorithm * whereas 'discrete' will iteratively search for the closest partition space to the embedding space. 
""" @@ -57,18 +57,19 @@ eps = 1e-6 graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps -# the actual number of regions in this example is 27: background and 26 coins +# The actual number of regions in this example is 27: background and 26 coins N_REGIONS = 26 ############################################################################# -# compute and visualize the resulting regions +# Compute and visualize the resulting regions -# if often helps the spectral clustering to compute a few extra eigenvectors +# It often helps the spectral clustering to compute a few extra eigenvectors N_REGIONS_PLUS = 3 for assign_labels in ('kmeans', 'discretize', 'clusterQR'): t0 = time.time() - labels = spectral_clustering(graph, n_clusters=N_REGIONS+N_REGIONS_PLUS, + labels = spectral_clustering(graph, + n_clusters=(N_REGIONS + N_REGIONS_PLUS), assign_labels=assign_labels, random_state=42) t1 = time.time() labels = labels.reshape(rescaled_coins.shape) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index c17f29ae95176..503cfe79b0df1 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -210,7 +210,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- affinity : array-like or sparse matrix, shape: (n_samples, n_samples) The affinity matrix describing the relationship of the samples to embed. **Must be symmetric**. 
From 3ce05d06ad1d97b481a80f04ab652a5e41724aa0 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 18:34:16 -0400 Subject: [PATCH 06/36] Update doc/modules/clustering.rst --- doc/modules/clustering.rst | 72 +++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 0e474b61a7b99..2fe6dbbb64477 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -436,22 +436,24 @@ given sample. Spectral clustering =================== -:class:`SpectralClustering` does a low-dimension embedding of the -affinity matrix between samples, followed by a KMeans in the low -dimensional space. It is especially efficient if the affinity matrix is -sparse and the `pyamg `_ module is installed. -SpectralClustering requires the number of clusters to be specified. It -works well for a small number of clusters but is not advised when using -many clusters. - -For two clusters, it solves a convex relaxation of the `normalised -cuts `_ problem on -the similarity graph: cutting the graph in two so that the weight of the -edges cut is small compared to the weights of the edges inside each -cluster. This criteria is especially interesting when working on images: -graph vertices are pixels, and edges of the similarity graph are a -function of the gradient of the image. - +:class:`SpectralClustering` performs a low-dimension embedding of the +affinity matrix between samples, followed by clustering, e.g., by KMeans, +of the components of the eigenvectors in the low dimensional space. +It is especially computationally efficient if the affinity matrix is sparse +and the solver `amg` is called to solve the eigenvalue problem, assuming that +the `pyamg `_ module is installed. + +The present version of SpectralClustering requires the number of clusters +to be specified in advance. 
It works well for a small number of clusters, +but is not advised for many clusters. + +For two clusters, SpectralClustering solves a convex relaxation of the +`normalised cuts `_ +problem on the similarity graph: cutting the graph in two so that the weight of +the edges cut is small compared to the weights of the edges inside each +cluster. This criterion is especially interesting when working on images, where +graph vertices are pixels, and weights of the edges of the similarity graph are +computed using a function of a gradient of the image. .. |noisy_img| image:: ../auto_examples/cluster/images/sphx_glr_plot_segmentation_toy_001.png :target: ../auto_examples/cluster/plot_segmentation_toy.html @@ -492,28 +494,32 @@ function of the gradient of the image. :target: ../auto_examples/cluster/plot_coin_segmentation.html :scale: 65 +.. |coin_clusterQR| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 65 + Different label assignment strategies ------------------------------------- Different label assignment strategies can be used, corresponding to the ``assign_labels`` parameter of :class:`SpectralClustering`. -The ``"kmeans"`` strategy can match finer details of the data, but it can be -more unstable. In particular, unless you control the ``random_state``, it -may not be reproducible from run-to-run, as it depends on a random -initialization. On the other hand, the ``"discretize"`` strategy is 100% -reproducible, but it tends to create parcels of fairly even and -geometrical shape. 
- -===================================== ===================================== - ``assign_labels="kmeans"`` ``assign_labels="discretize"`` -===================================== ===================================== -|coin_kmeans| |coin_discretize| -===================================== ===================================== +``"kmeans"`` strategy can match finer details, but it can be unstable. +In particular, unless you control the ``random_state``, it may not be +reproducible from run-to-run, as it depends on random initialization. +Alternative ``"discretize"`` strategy is 100% reproducible, but tends +to create parcels of fairly even and geometrical shape. +The recently added option ``clusterQR`` is also 100% reproducible. + +=========================== =============================== ============================== + ``assign_labels="kmeans"`` |``assign_labels="discretize"`` |``assign_labels="clusterQR"`` +=========================== =============================== ============================== +|coin_kmeans| |coin_discretize| |coin_clusterQR| +=========================== =============================== ============================== Spectral Clustering Graphs -------------------------- -Spectral Clustering can also be used to cluster graphs by their spectral +Spectral Clustering can also be used to partition graphs via their spectral embeddings. In this case, the affinity matrix is the adjacency matrix of the graph, and SpectralClustering is initialized with `affinity='precomputed'`:: @@ -540,6 +546,14 @@ graph, and SpectralClustering is initialized with `affinity='precomputed'`:: `_ Andrew Y. Ng, Michael I. Jordan, Yair Weiss, 2001 + * `"Robust and efficient multi-way spectral clustering" + `_ + Anil Damle, Victor Minden, Lexing Ying + + * `"Preconditioned Spectral Clustering for Stochastic + Block Partition Streaming Graph Challenge" + `_ + David Zhuzhunashvili, Andrew Knyazev .. 
_hierarchical_clustering: From 5789fb8092885f964ec3c6988631d68b787fad56 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 19:15:51 -0400 Subject: [PATCH 07/36] formatting/typo --- examples/cluster/plot_coin_segmentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 6493684f54fcb..8cb06564be9bd 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -10,7 +10,7 @@ This procedure (spectral clustering on an image) is an efficient approximate solution for finding normalized graph cuts. -There are two options to assign labels: +There are three options to assign labels: * with 'kmeans' spectral clustering will cluster samples in the embedding space using a kmeans algorithm @@ -18,6 +18,7 @@ using a clusterQR algorithm * whereas 'discrete' will iteratively search for the closest partition space to the embedding space. + """ print(__doc__) From 9a0638b04ec740a5c2e53d4fc639a6bd9e3e198d Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Sat, 6 Oct 2018 20:31:25 -0400 Subject: [PATCH 08/36] spectral.py typo fixed --- sklearn/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 503cfe79b0df1..3b573427e0dda 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -338,7 +338,7 @@ class SpectralClustering(BaseEstimator, ClusterMixin): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- n_clusters : integer, optional The dimension of the projection subspace. 
From b3133062e179072cad2640a3e294110d8f4c1cf4 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Thu, 11 Oct 2018 08:50:07 -0400 Subject: [PATCH 09/36] Update sklearn/cluster/spectral.py --- sklearn/cluster/spectral.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 3b573427e0dda..949a9317b9f29 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -80,6 +80,7 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, Determines random number generation for rotation matrix initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. + Returns ------- labels : array of integers, shape: n_samples From ac8445fe98e1c738d781e0aad24f8233a2ab15a7 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Mon, 4 Feb 2019 23:18:58 -0500 Subject: [PATCH 10/36] github.com/scikit-learn/scikit-learn/pull/12316#discussion_r251665191 --- sklearn/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 949a9317b9f29..1cd8526f3a8bf 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -53,7 +53,7 @@ def clusterQR(vectors): Ut, Vt = svd(vectors[piv, :].T.conj())[0],\ svd(vectors[piv, :].T.conj())[2].T.conj() vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T.conj()))) - return (vectors.argmax(axis=1)).T + return vectors.argmax(axis=1).T def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, From 06fba54506022c4da77d8933980970600f5f99ce Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Tue, 5 Feb 2019 00:30:13 -0500 Subject: [PATCH 11/36] https://github.com/scikit-learn/scikit-learn/pull/12316#discussion_r251665390 --- sklearn/cluster/spectral.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git 
a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 1cd8526f3a8bf..58e1b7d7632f6 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -250,12 +250,16 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian + space. There are three ways to assign labels after the laplacian embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. See the 'Multiclass spectral clustering' paper referenced below for - more details on the discretization approach. + more details on the discretization approach. The newest clusterQR + directly extracts clusters from eigenvectors in spectral clustering. + In contrast to k-means and discretization, clusterQR has no tuning + parameters, e.g., runs no iterations, yet may outperform k-means and + discretization in terms of both quality and speed. Returns ------- @@ -277,6 +281,15 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Stella X. 
Yu, Jianbo Shi https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf + - Robust and efficient multi-way spectral clustering + Anil Damle, Victor Minden, Lexing Ying + https://github.com/asdamle/QR-spectral-clustering + + - Preconditioned Spectral Clustering for Stochastic Block Partition + Streaming Graph Challenge + David Zhuzhunashvili, Andrew Knyazev + https://arxiv.org/abs/1708.07481 + Notes ------ The graph should contain only one connect component, elsewhere From e8b3b87680baa9eee7090cb59d7cd7fb68800a12 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Tue, 5 Mar 2019 16:00:16 -0500 Subject: [PATCH 12/36] Update spectral.py fix for https://github.com/scikit-learn/scikit-learn/pull/12316#discussion_r251665374 --- sklearn/cluster/spectral.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 58e1b7d7632f6..5510cdb5d59f8 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -50,8 +50,9 @@ def clusterQR(vectors): k = vectors.shape[1] piv = qr(vectors.T.conj(), pivoting=True)[2] piv = piv[0:k] - Ut, Vt = svd(vectors[piv, :].T.conj())[0],\ - svd(vectors[piv, :].T.conj())[2].T.conj() + UtSV = svd(vectors[piv,:].T.conj()) + Ut = UtSV[0] + Vt = UtSV[2].T.conj() vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T.conj()))) return vectors.argmax(axis=1).T From 6b00bbbdc0742fc43b1259558c058688f9f49d6c Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Tue, 5 Mar 2019 16:32:20 -0500 Subject: [PATCH 13/36] removed redundant SVD --- sklearn/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 5510cdb5d59f8..1b583b085d585 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -50,7 +50,7 @@ def clusterQR(vectors): k = vectors.shape[1] piv = qr(vectors.T.conj(), 
pivoting=True)[2] piv = piv[0:k] - UtSV = svd(vectors[piv,:].T.conj()) + UtSV = svd(vectors[piv, :].T.conj()) Ut = UtSV[0] Vt = UtSV[2].T.conj() vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T.conj()))) From 8cdcda2dfea2841bc2699b90219296020c481532 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Tue, 5 Mar 2019 18:09:50 -0500 Subject: [PATCH 14/36] testing new tol 1e-5 defaults and the laplacian shift in AMG --- sklearn/manifold/spectral_embedding_.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index e387ecec0f4d5..2de79ccaa954f 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -288,11 +288,12 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True) laplacian = _set_diag(laplacian, 1, norm_laplacian) + laplacian = laplacian + 1e-5 * sparse.eye(laplacian.shape[0]) ml = smoothed_aggregation_solver(check_array(laplacian, 'csr')) M = ml.aspreconditioner() X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() - lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12, + lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-5, largest=False) embedding = diffusion_map.T if norm_laplacian: @@ -320,7 +321,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, # doesn't behave well in low dimension X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() - lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15, + lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-5, largest=False, maxiter=2000) embedding = diffusion_map.T[:n_components] if norm_laplacian: From 3b6844d0711feb147fedf50ea8b06d066ff229fe Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Tue, 5 Mar 2019 22:10:59 
-0500 Subject: [PATCH 15/36] changed to eigen_solver='amg' --- examples/cluster/plot_coin_segmentation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index fa5da367f278c..c0e5c801ba53d 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -72,6 +72,7 @@ ############################################################################# # Compute and visualize the resulting regions +# Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. AMG is usually best # It often helps the spectral clustering to compute a few extra eigenvectors N_REGIONS_PLUS = 3 @@ -79,7 +80,8 @@ t0 = time.time() labels = spectral_clustering(graph, n_clusters=(N_REGIONS + N_REGIONS_PLUS), - assign_labels=assign_labels, random_state=42) + assign_labels=assign_labels, random_state=42, + eigen_solver='amg') t1 = time.time() labels = labels.reshape(rescaled_coins.shape) From e1fd3a058e4822e6bbff9defb393a0f5f2842e12 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 6 Mar 2019 10:20:01 -0500 Subject: [PATCH 16/36] eigen_solver='amg' is not installed in CircleCI so back to 'arpack' --- examples/cluster/plot_coin_segmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index c0e5c801ba53d..cc606aae2408a 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -13,9 +13,9 @@ There are three options to assign labels: * with 'kmeans' spectral clustering will cluster samples in the embedding space - using a kmeans algorithm + using a kmeans algorithm, * with 'clusterQR' will cluster samples in the embedding space - using a clusterQR algorithm + using a clusterQR algorithm, * whereas 'discrete' will iteratively search for the closest 
partition space to the embedding space. @@ -81,7 +81,7 @@ labels = spectral_clustering(graph, n_clusters=(N_REGIONS + N_REGIONS_PLUS), assign_labels=assign_labels, random_state=42, - eigen_solver='amg') + eigen_solver='arpack') t1 = time.time() labels = labels.reshape(rescaled_coins.shape) From a11d94a754aee6bd4619477e9b424eb03eba30ba Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 6 Mar 2019 16:39:22 -0500 Subject: [PATCH 17/36] comment updated to include amg and lobpcg, now fixed --- examples/cluster/plot_segmentation_toy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py index a6980c5f271ef..3c3cbe9d66548 100644 --- a/examples/cluster/plot_segmentation_toy.py +++ b/examples/cluster/plot_segmentation_toy.py @@ -68,11 +68,10 @@ graph = image.img_to_graph(img, mask=mask) # Take a decreasing function of the gradient: we take it weakly -# dependent from the gradient the segmentation is close to a voronoi +# dependent from the gradient the segmentation is close to a voronoi. graph.data = np.exp(-graph.data / graph.data.std()) -# Force the solver to be arpack, since amg is numerically -# unstable on this example +# Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. labels = spectral_clustering(graph, n_clusters=4, eigen_solver='arpack') label_im = np.full(mask.shape, -1.) 
label_im[mask] = labels From 5b7afbc14ff08897b42cadf608360f3a02f203aa Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 16:10:10 -0400 Subject: [PATCH 18/36] AMG test on NOT fully connected graph --- sklearn/manifold/tests/test_spectral_embedding.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index bc9a718271271..a8ff112af1c5c 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -170,11 +170,19 @@ def test_spectral_embedding_amg_solver(seed=36): except ImportError: raise SkipTest("pyamg not available.") + # The generated graph below is NOT fully connected + n_samples = 200 + n_clusters = 3 + n_features = 3 + centers = np.eye(n_clusters, n_features) + S, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) + se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=5, + eigen_solver="amg", n_neighbors=3, random_state=np.random.RandomState(seed)) se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="arpack", n_neighbors=5, + eigen_solver="arpack", n_neighbors=3, random_state=np.random.RandomState(seed)) embed_amg = se_amg.fit_transform(S) embed_arpack = se_arpack.fit_transform(S) From 4189f7c316d42300e9d99a6eed38f006ad97058f Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 16:43:40 -0400 Subject: [PATCH 19/36] last changes reversed for error debugging --- sklearn/manifold/tests/test_spectral_embedding.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index a8ff112af1c5c..bc9a718271271 100644 --- 
a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -170,19 +170,11 @@ def test_spectral_embedding_amg_solver(seed=36): except ImportError: raise SkipTest("pyamg not available.") - # The generated graph below is NOT fully connected - n_samples = 200 - n_clusters = 3 - n_features = 3 - centers = np.eye(n_clusters, n_features) - S, true_labels = make_blobs(n_samples=n_samples, centers=centers, - cluster_std=1., random_state=42) - se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, + eigen_solver="amg", n_neighbors=5, random_state=np.random.RandomState(seed)) se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="arpack", n_neighbors=3, + eigen_solver="arpack", n_neighbors=5, random_state=np.random.RandomState(seed)) embed_amg = se_amg.fit_transform(S) embed_arpack = se_arpack.fit_transform(S) From 32e4d19902c78e9ba86fe9eeceb1fd6c76d539ca Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 18:15:47 -0400 Subject: [PATCH 20/36] new AMG example, still connected graph --- sklearn/manifold/tests/test_spectral_embedding.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index bc9a718271271..fc347c4428212 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -170,6 +170,14 @@ def test_spectral_embedding_amg_solver(seed=36): except ImportError: raise SkipTest("pyamg not available.") + # The generated graph below is NOT fully connected if n_neighbors=3 + n_samples = 200 + n_clusters = 3 + n_features = 3 + centers = np.eye(n_clusters, n_features) + S, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) + se_amg = SpectralEmbedding(n_components=2, 
affinity="nearest_neighbors", eigen_solver="amg", n_neighbors=5, random_state=np.random.RandomState(seed)) From 035d39549f683dd6306834cf5775557939a7cc98 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 18:20:33 -0400 Subject: [PATCH 21/36] change n_neighbors=5 to 3 for graph to disconnect --- sklearn/manifold/tests/test_spectral_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index fc347c4428212..08286c0f108a8 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -179,10 +179,10 @@ def test_spectral_embedding_amg_solver(seed=36): cluster_std=1., random_state=42) se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=5, + eigen_solver="amg", n_neighbors=3, random_state=np.random.RandomState(seed)) se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="arpack", n_neighbors=5, + eigen_solver="arpack", n_neighbors=3, random_state=np.random.RandomState(seed)) embed_amg = se_amg.fit_transform(S) embed_arpack = se_arpack.fit_transform(S) From 5f7916380015bea594c16cc58035106794133dc6 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 19:16:27 -0400 Subject: [PATCH 22/36] reversed changes to get back the original --- sklearn/manifold/tests/test_spectral_embedding.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 08286c0f108a8..bc9a718271271 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -170,19 +170,11 @@ def test_spectral_embedding_amg_solver(seed=36): except 
ImportError: raise SkipTest("pyamg not available.") - # The generated graph below is NOT fully connected if n_neighbors=3 - n_samples = 200 - n_clusters = 3 - n_features = 3 - centers = np.eye(n_clusters, n_features) - S, true_labels = make_blobs(n_samples=n_samples, centers=centers, - cluster_std=1., random_state=42) - se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, + eigen_solver="amg", n_neighbors=5, random_state=np.random.RandomState(seed)) se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", - eigen_solver="arpack", n_neighbors=3, + eigen_solver="arpack", n_neighbors=5, random_state=np.random.RandomState(seed)) embed_amg = se_amg.fit_transform(S) embed_arpack = se_arpack.fit_transform(S) From 4d3b7f9c6eac7baa9c9c1abd7eedb377ef60354e Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 19:28:55 -0400 Subject: [PATCH 23/36] added new AMG test NOT fully connected graph --- .../manifold/tests/test_spectral_embedding.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index bc9a718271271..8598474a99eb3 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -181,6 +181,32 @@ def test_spectral_embedding_amg_solver(seed=36): assert _check_with_col_sign_flipping(embed_amg, embed_arpack, 0.05) +def test_spectral_embedding_amg_solver_failure(seed=36): + # Test spectral embedding with amg solver failure + try: + from pyamg import smoothed_aggregation_solver # noqa + except ImportError: + raise SkipTest("pyamg not available.") + + # The generated graph below is NOT fully connected if n_neighbors=3 + n_samples = 200 + n_clusters = 3 + n_features = 3 + centers = np.eye(n_clusters, n_features) + S, true_labels = make_blobs(n_samples=n_samples, 
centers=centers, + cluster_std=1., random_state=42) + + se_amg0 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", + eigen_solver="amg", n_neighbors=3, + random_state=np.random.RandomState(seed)) + se_amg1 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", + eigen_solver="amg", n_neighbors=3, + random_state=np.random.RandomState(seed+1)) + embed_amg0 = se_amg0.fit_transform(S) + embed_amg1 = se_amg1.fit_transform(S) + assert _check_with_col_sign_flipping(embed_amg0, embed_amg1, 0.05) + + @pytest.mark.filterwarnings("ignore:the behavior of nmi will " "change in version 0.22") def test_pipeline_spectral_clustering(seed=36): From 1822016a4d2f34c16a12c5b7c56f9b4b705a4bbc Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Wed, 10 Apr 2019 19:46:19 -0400 Subject: [PATCH 24/36] flake8 line under-indented fixes --- sklearn/manifold/tests/test_spectral_embedding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 8598474a99eb3..d7477d2ed836f 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -197,11 +197,11 @@ def test_spectral_embedding_amg_solver_failure(seed=36): cluster_std=1., random_state=42) se_amg0 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, - random_state=np.random.RandomState(seed)) + eigen_solver="amg", n_neighbors=3, + random_state=np.random.RandomState(seed)) se_amg1 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, - random_state=np.random.RandomState(seed+1)) + eigen_solver="amg", n_neighbors=3, + random_state=np.random.RandomState(seed+1)) embed_amg0 = se_amg0.fit_transform(S) embed_amg1 = se_amg1.fit_transform(S) assert _check_with_col_sign_flipping(embed_amg0, embed_amg1, 0.05) 
From be2018ddce1ac95640a2906787135656cb67f03f Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 17:50:59 -0400 Subject: [PATCH 25/36] reverse changes that appear in a separate PR --- .../manifold/tests/test_spectral_embedding.py | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 411711cd254e0..bac3b99776bcf 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -179,32 +179,6 @@ def test_spectral_embedding_amg_solver(seed=36): assert _check_with_col_sign_flipping(embed_amg, embed_arpack, 0.05) -def test_spectral_embedding_amg_solver_failure(seed=36): - # Test spectral embedding with amg solver failure - try: - from pyamg import smoothed_aggregation_solver # noqa - except ImportError: - raise SkipTest("pyamg not available.") - - # The generated graph below is NOT fully connected if n_neighbors=3 - n_samples = 200 - n_clusters = 3 - n_features = 3 - centers = np.eye(n_clusters, n_features) - S, true_labels = make_blobs(n_samples=n_samples, centers=centers, - cluster_std=1., random_state=42) - - se_amg0 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, - random_state=np.random.RandomState(seed)) - se_amg1 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", - eigen_solver="amg", n_neighbors=3, - random_state=np.random.RandomState(seed+1)) - embed_amg0 = se_amg0.fit_transform(S) - embed_amg1 = se_amg1.fit_transform(S) - assert _check_with_col_sign_flipping(embed_amg0, embed_amg1, 0.05) - - @pytest.mark.filterwarnings("ignore:the behavior of nmi will " "change in version 0.22") From f962ed9f681c1bea0dda37addbc85976fa782237 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 17:52:45 -0400 Subject: [PATCH 26/36] reverse changes that appear in a separate PR --- 
sklearn/manifold/spectral_embedding_.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index bf1e60eee8d18..52b0bb929302d 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -289,7 +289,6 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True) laplacian = _set_diag(laplacian, 1, norm_laplacian) - laplacian = laplacian + 1e-5 * sparse.eye(laplacian.shape[0]) ml = smoothed_aggregation_solver(check_array(laplacian, 'csr')) M = ml.aspreconditioner() X = random_state.rand(laplacian.shape[0], n_components + 1) From 3d20676a6218e45a815394c9ab08c60d246fe460 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 18:05:55 -0400 Subject: [PATCH 27/36] remove conj - complex numbers are not neeed --- sklearn/cluster/spectral.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index f5a49ddfde690..795947666b47b 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -39,21 +39,15 @@ def clusterQR(vectors): ---------- https://github.com/asdamle/QR-spectral-clustering https://arxiv.org/abs/1708.07481 - - Notes - ----- - T.conj() allows the vectors to be complex-valued, - just in case for future use - """ k = vectors.shape[1] - piv = qr(vectors.T.conj(), pivoting=True)[2] + piv = qr(vectors.T, pivoting=True)[2] piv = piv[0:k] - UtSV = svd(vectors[piv, :].T.conj()) + UtSV = svd(vectors[piv, :].T) Ut = UtSV[0] Vt = UtSV[2].T.conj() - vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T.conj()))) + vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T))) return vectors.argmax(axis=1).T From f0d6f5c51afebed283a5d66915dd928b0ca6c44f Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 18:22:49 -0400 Subject: [PATCH 28/36] reverse irrelevant change 
--- sklearn/manifold/tests/test_spectral_embedding.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index bac3b99776bcf..9209f5762d221 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -179,10 +179,6 @@ def test_spectral_embedding_amg_solver(seed=36): assert _check_with_col_sign_flipping(embed_amg, embed_arpack, 0.05) -@pytest.mark.filterwarnings("ignore:the behavior of nmi will " - "change in version 0.22") - - def test_pipeline_spectral_clustering(seed=36): # Test using pipeline to do spectral clustering random_state = np.random.RandomState(seed) From 6dfee889acf560ec42446d5437b83b759fa5e71b Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 18:33:19 -0400 Subject: [PATCH 29/36] reverse changes not relevant to this PR --- sklearn/manifold/spectral_embedding_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 52b0bb929302d..42227db8a72ad 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -293,7 +293,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, M = ml.aspreconditioner() X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() - lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-5, + lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12, largest=False) embedding = diffusion_map.T if norm_laplacian: @@ -321,7 +321,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, # doesn't behave well in low dimension X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() - lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-5, + lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15, largest=False, maxiter=2000) 
embedding = diffusion_map.T[:n_components] if norm_laplacian: From ba4a4519eeb479004f739288b1253f456f5c05b9 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 19:00:54 -0400 Subject: [PATCH 30/36] reverse irrelevant changes to this PR --- examples/cluster/plot_segmentation_toy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py index 3c3cbe9d66548..39d07768afb2c 100644 --- a/examples/cluster/plot_segmentation_toy.py +++ b/examples/cluster/plot_segmentation_toy.py @@ -68,10 +68,9 @@ graph = image.img_to_graph(img, mask=mask) # Take a decreasing function of the gradient: we take it weakly -# dependent from the gradient the segmentation is close to a voronoi. +# dependent from the gradient the segmentation is close to a voronoi. graph.data = np.exp(-graph.data / graph.data.std()) -# Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. labels = spectral_clustering(graph, n_clusters=4, eigen_solver='arpack') label_im = np.full(mask.shape, -1.) label_im[mask] = labels From 6f9b305a45da503231eb71c9b38f288bdec737f2 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 19:04:24 -0400 Subject: [PATCH 31/36] typo fixed --- examples/cluster/plot_segmentation_toy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py index 39d07768afb2c..e501573c61ae6 100644 --- a/examples/cluster/plot_segmentation_toy.py +++ b/examples/cluster/plot_segmentation_toy.py @@ -68,7 +68,7 @@ graph = image.img_to_graph(img, mask=mask) # Take a decreasing function of the gradient: we take it weakly -# dependent from the gradient the segmentation is close to a voronoi. 
+# dependent from the gradient the segmentation is close to a voronoi graph.data = np.exp(-graph.data / graph.data.std()) labels = spectral_clustering(graph, n_clusters=4, eigen_solver='arpack') From a9e9260070b4fe2db0c2c1730f6c1701ea7c2fef Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 1 Aug 2019 19:06:21 -0400 Subject: [PATCH 32/36] reverse changes irrelevant to this PR --- examples/cluster/plot_segmentation_toy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py index e501573c61ae6..a6980c5f271ef 100644 --- a/examples/cluster/plot_segmentation_toy.py +++ b/examples/cluster/plot_segmentation_toy.py @@ -71,6 +71,8 @@ # dependent from the gradient the segmentation is close to a voronoi graph.data = np.exp(-graph.data / graph.data.std()) +# Force the solver to be arpack, since amg is numerically +# unstable on this example labels = spectral_clustering(graph, n_clusters=4, eigen_solver='arpack') label_im = np.full(mask.shape, -1.) 
label_im[mask] = labels From 78a788a306886249b60405be20eaf1d31997caef Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 29 Nov 2019 11:13:22 -0500 Subject: [PATCH 33/36] trying to fix a conflict --- sklearn/cluster/spectral.py | 582 ------------------------------------ 1 file changed, 582 deletions(-) delete mode 100644 sklearn/cluster/spectral.py diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py deleted file mode 100644 index c17f5fc2bc14a..0000000000000 --- a/sklearn/cluster/spectral.py +++ /dev/null @@ -1,582 +0,0 @@ -# -*- coding: utf-8 -*- -"""Algorithms for spectral clustering""" - -# Author: Gael Varoquaux gael.varoquaux@normalesup.org -# Brian Cheung -# Wei LI -# Modified by Andrew Knyazev to add clusterQR -# License: BSD 3 clause -import warnings - -import numpy as np - -from scipy.linalg import qr, svd - -from ..base import BaseEstimator, ClusterMixin -from ..utils import check_random_state, as_float_array -from ..utils.validation import check_array -from ..metrics.pairwise import pairwise_kernels -from ..neighbors import kneighbors_graph -from ..manifold import spectral_embedding -from .k_means_ import k_means - - -def clusterQR(vectors): - """Search for a partition matrix (clustering) which is - closest to the eigenvector embedding. - - Parameters - ---------- - vectors : array-like, shape: (n_samples, n_clusters) - The embedding space of the samples. - - Returns - ------- - labels : array of integers, shape: n_samples - The labels of the clusters. 
- - References - ---------- - https://github.com/asdamle/QR-spectral-clustering - https://arxiv.org/abs/1708.07481 - """ - - k = vectors.shape[1] - piv = qr(vectors.T, pivoting=True)[2] - piv = piv[0:k] - UtSV = svd(vectors[piv, :].T) - Ut = UtSV[0] - Vt = UtSV[2].T.conj() - vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T))) - return vectors.argmax(axis=1).T - - -def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, - random_state=None): - """Search for a partition matrix (clustering) which is closest to the - eigenvector embedding. - - Parameters - ---------- - vectors : array-like, shape: (n_samples, n_clusters) - The embedding space of the samples. - - copy : boolean, optional, default: True - Whether to copy vectors, or perform in-place normalization. - - max_svd_restarts : int, optional, default: 30 - Maximum number of attempts to restart SVD if convergence fails - - n_iter_max : int, optional, default: 30 - Maximum number of iterations to attempt in rotation and partition - matrix search if machine precision convergence is not reached - - random_state : int, RandomState instance or None (default) - Determines random number generation for rotation matrix initialization. - Use an int to make the randomness deterministic. - See :term:`Glossary `. - - Returns - ------- - labels : array of integers, shape: n_samples - The labels of the clusters. - - References - ---------- - - - Multiclass spectral clustering, 2003 - Stella X. Yu, Jianbo Shi - https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf - - Notes - ----- - - The eigenvector embedding is used to iteratively search for the - closest discrete partition. First, the eigenvector embedding is - normalized to the space of partition matrices. An optimal discrete - partition matrix closest to this normalized embedding multiplied by - an initial rotation is calculated. Fixing this discrete partition - matrix, an optimal rotation matrix is calculated. 
These two - calculations are performed until convergence. The discrete partition - matrix is returned as the clustering solution. Used in spectral - clustering, this method tends to be faster and more robust to random - initialization than k-means. - - """ - - from scipy.sparse import csc_matrix - from scipy.linalg import LinAlgError - - random_state = check_random_state(random_state) - - vectors = as_float_array(vectors, copy=copy) - - eps = np.finfo(float).eps - n_samples, n_components = vectors.shape - - # Normalize the eigenvectors to an equal length of a vector of ones. - # Reorient the eigenvectors to point in the negative direction with respect - # to the first element. This may have to do with constraining the - # eigenvectors to lie in a specific quadrant to make the discretization - # search easier. - norm_ones = np.sqrt(n_samples) - for i in range(vectors.shape[1]): - vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) \ - * norm_ones - if vectors[0, i] != 0: - vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i]) - - # Normalize the rows of the eigenvectors. Samples should lie on the unit - # hypersphere centered at the origin. This transforms the samples in the - # embedding space to the space of partition matrices. - vectors = vectors / np.sqrt((vectors ** 2).sum(axis=1))[:, np.newaxis] - - svd_restarts = 0 - has_converged = False - - # If there is an exception we try to randomize and rerun SVD again - # do this max_svd_restarts times. 
- while (svd_restarts < max_svd_restarts) and not has_converged: - - # Initialize first column of rotation matrix with a row of the - # eigenvectors - rotation = np.zeros((n_components, n_components)) - rotation[:, 0] = vectors[random_state.randint(n_samples), :].T - - # To initialize the rest of the rotation matrix, find the rows - # of the eigenvectors that are as orthogonal to each other as - # possible - c = np.zeros(n_samples) - for j in range(1, n_components): - # Accumulate c to ensure row is as orthogonal as possible to - # previous picks as well as current one - c += np.abs(np.dot(vectors, rotation[:, j - 1])) - rotation[:, j] = vectors[c.argmin(), :].T - - last_objective_value = 0.0 - n_iter = 0 - - while not has_converged: - n_iter += 1 - - t_discrete = np.dot(vectors, rotation) - - labels = t_discrete.argmax(axis=1) - vectors_discrete = csc_matrix( - (np.ones(len(labels)), (np.arange(0, n_samples), labels)), - shape=(n_samples, n_components)) - - t_svd = vectors_discrete.T * vectors - - try: - U, S, Vh = np.linalg.svd(t_svd) - svd_restarts += 1 - except LinAlgError: - print("SVD did not converge, randomizing and trying again") - break - - ncut_value = 2.0 * (n_samples - S.sum()) - if ((abs(ncut_value - last_objective_value) < eps) or - (n_iter > n_iter_max)): - has_converged = True - else: - # otherwise calculate rotation and continue - last_objective_value = ncut_value - rotation = np.dot(Vh.T, U.T) - - if not has_converged: - raise LinAlgError('SVD did not converge') - return labels - - -def spectral_clustering(affinity, n_clusters=8, n_components=None, - eigen_solver=None, random_state=None, n_init=10, - eigen_tol=0.0, assign_labels='kmeans'): - """Apply clustering to a projection of the normalized Laplacian. 
- - In practice Spectral Clustering is very useful when the structure of - the individual clusters is highly non-convex or more generally when - a measure of the center and spread of the cluster is not a suitable - description of the complete cluster. For instance, when clusters are - nested circles on the 2D plane. - - If affinity is the adjacency matrix of a graph, this method can be - used to find normalized graph cuts. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - affinity : array-like or sparse matrix, shape: (n_samples, n_samples) - The affinity matrix describing the relationship of the samples to - embed. **Must be symmetric**. - - Possible examples: - - adjacency matrix of a graph, - - heat kernel of the pairwise distance matrix of the samples, - - symmetric k-nearest neighbours connectivity matrix of the samples. - - n_clusters : integer, optional - Number of clusters to extract. - - n_components : integer, optional, default is n_clusters - Number of eigen vectors to use for the spectral embedding - - eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities - - random_state : int, RandomState instance or None (default) - A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by - the K-Means initialization. Use an int to make the randomness - deterministic. - See :term:`Glossary `. - - n_init : int, optional, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - eigen_tol : float, optional, default: 0.0 - Stopping criterion for eigendecomposition of the Laplacian matrix - when using arpack eigen_solver. 
- - assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' - The strategy to use to assign labels in the embedding - space. There are three ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. But it can - also be sensitive to initialization. Discretization is another - approach which is less sensitive to random initialization. See - the 'Multiclass spectral clustering' paper referenced below for - more details on the discretization approach. The newest clusterQR - directly extract clusters from eigenvectors in spectral clustering. - In contrast to k-means and discretization, clusterQR has no tuning - parameters, e.g., runs no iterations, yet may outperform k-means and - discretization in terms of both quality and speed. - - Returns - ------- - labels : array of integers, shape: n_samples - The labels of the clusters. - - References - ---------- - - - Normalized cuts and image segmentation, 2000 - Jianbo Shi, Jitendra Malik - http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 - - - A Tutorial on Spectral Clustering, 2007 - Ulrike von Luxburg - http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323 - - - Multiclass spectral clustering, 2003 - Stella X. Yu, Jianbo Shi - https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf - - - Robust and efficient multi-way spectral clustering - Anil Damle, Victor Minden, Lexing Ying - https://github.com/asdamle/QR-spectral-clustering - - - Preconditioned Spectral Clustering for Stochastic Block Partition - Streaming Graph Challenge - David Zhuzhunashvili, Andrew Knyazev - https://arxiv.org/abs/1708.07481 - - Notes - ----- - The graph should contain only one connect component, elsewhere - the results make little sense. - - This algorithm solves the normalized cut for k=2: it is a - normalized spectral clustering. 
- """ - if assign_labels not in ('kmeans', 'discretize', 'clusterQR'): - raise ValueError( - "The 'assign_labels' parameter should be " - "'kmeans', 'discretize', or 'clusterQR' but '%s' was given" - % assign_labels) - - random_state = check_random_state(random_state) - n_components = n_clusters if n_components is None else n_components - - # The first eigen vector is constant only for fully connected graphs - # and should be kept for spectral clustering (drop_first = False) - # See spectral_embedding documentation. - maps = spectral_embedding(affinity, n_components=n_components, - eigen_solver=eigen_solver, - random_state=random_state, - eigen_tol=eigen_tol, drop_first=False) - - if assign_labels == 'kmeans': - _, labels, _ = k_means(maps, n_clusters, random_state=random_state, - n_init=n_init) - elif assign_labels == 'clusterQR': - labels = clusterQR(maps) - else: - labels = discretize(maps, random_state=random_state) - - return labels - - -class SpectralClustering(BaseEstimator, ClusterMixin): - """Apply clustering to a projection of the normalized Laplacian. - - In practice Spectral Clustering is very useful when the structure of - the individual clusters is highly non-convex or more generally when - a measure of the center and spread of the cluster is not a suitable - description of the complete cluster. For instance when clusters are - nested circles on the 2D plane. - - If affinity is the adjacency matrix of a graph, this method can be - used to find normalized graph cuts. - - When calling ``fit``, an affinity matrix is constructed using either - kernel function such the Gaussian (aka RBF) kernel of the euclidean - distanced ``d(X, X)``:: - - np.exp(-gamma * d(X,X) ** 2) - - or a k-nearest neighbors connectivity matrix. - - Alternatively, using ``precomputed``, a user-provided affinity - matrix can be used. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_clusters : integer, optional - The dimension of the projection subspace. 
- - eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. - - n_components : integer, optional, default=n_clusters - Number of eigen vectors to use for the spectral embedding - - random_state : int, RandomState instance or None (default) - A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by - the K-Means initialization. Use an int to make the randomness - deterministic. - See :term:`Glossary `. - - n_init : int, optional, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - gamma : float, default=1.0 - Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. - Ignored for ``affinity='nearest_neighbors'``. - - affinity : string, array-like or callable, default 'rbf' - If a string, this may be one of 'nearest_neighbors', 'precomputed', - 'rbf' or one of the kernels supported by - `sklearn.metrics.pairwise_kernels`. - - Only kernels that produce similarity scores (non-negative values that - increase with similarity) should be used. This property is not checked - by the clustering algorithm. - - n_neighbors : integer - Number of neighbors to use when constructing the affinity matrix using - the nearest neighbors method. Ignored for ``affinity='rbf'``. - - eigen_tol : float, optional, default: 0.0 - Stopping criterion for eigendecomposition of the Laplacian matrix - when ``eigen_solver='arpack'``. - - assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' - The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. 
But it can - also be sensitive to initialization. Discretization is another approach - which is less sensitive to random initialization. - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. - Ignored by other kernels. - - kernel_params : dictionary of string to any, optional - Parameters (keyword arguments) and values for kernel passed as - callable object. Ignored by other kernels. - - n_jobs : int or None, optional (default=None) - The number of parallel jobs to run. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - Attributes - ---------- - affinity_matrix_ : array-like, shape (n_samples, n_samples) - Affinity matrix used for clustering. Available only if after calling - ``fit``. - - labels_ : array, shape (n_samples,) - Labels of each point - - Examples - -------- - >>> from sklearn.cluster import SpectralClustering - >>> import numpy as np - >>> X = np.array([[1, 1], [2, 1], [1, 0], - ... [4, 7], [3, 5], [3, 6]]) - >>> clustering = SpectralClustering(n_clusters=2, - ... assign_labels="discretize", - ... random_state=0).fit(X) - >>> clustering.labels_ - array([1, 1, 1, 0, 0, 0]) - >>> clustering - SpectralClustering(assign_labels='discretize', n_clusters=2, - random_state=0) - - Notes - ----- - If you have an affinity matrix, such as a distance matrix, - for which 0 means identical elements, and high values means - very dissimilar elements, it can be transformed in a - similarity matrix that is well suited for the algorithm by - applying the Gaussian (RBF, heat) kernel:: - - np.exp(- dist_matrix ** 2 / (2. * delta ** 2)) - - Where ``delta`` is a free parameter representing the width of the Gaussian - kernel. - - Another alternative is to take a symmetric version of the k - nearest neighbors connectivity matrix of the points. 
- - If the pyamg package is installed, it is used: this greatly - speeds up computation. - - References - ---------- - - - Normalized cuts and image segmentation, 2000 - Jianbo Shi, Jitendra Malik - http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 - - - A Tutorial on Spectral Clustering, 2007 - Ulrike von Luxburg - http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323 - - - Multiclass spectral clustering, 2003 - Stella X. Yu, Jianbo Shi - https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf - """ - - def __init__(self, n_clusters=8, eigen_solver=None, n_components=None, - random_state=None, n_init=10, gamma=1., affinity='rbf', - n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', - degree=3, coef0=1, kernel_params=None, n_jobs=None): - self.n_clusters = n_clusters - self.eigen_solver = eigen_solver - self.n_components = n_components - self.random_state = random_state - self.n_init = n_init - self.gamma = gamma - self.affinity = affinity - self.n_neighbors = n_neighbors - self.eigen_tol = eigen_tol - self.assign_labels = assign_labels - self.degree = degree - self.coef0 = coef0 - self.kernel_params = kernel_params - self.n_jobs = n_jobs - - def fit(self, X, y=None): - """Perform spectral clustering from features, or affinity matrix. - - Parameters - ---------- - X : array-like or sparse matrix, shape (n_samples, n_features), or \ - array-like, shape (n_samples, n_samples) - Training instances to cluster, or similarities / affinities between - instances if ``affinity='precomputed'``. If a sparse matrix is - provided in a format other than ``csr_matrix``, ``csc_matrix``, - or ``coo_matrix``, it will be converted into a sparse - ``csr_matrix``. - - y : Ignored - Not used, present here for API consistency by convention. 
- - Returns - ------- - self - - """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) - if X.shape[0] == X.shape[1] and self.affinity != "precomputed": - warnings.warn("The spectral clustering API has changed. ``fit``" - "now constructs an affinity matrix from data. To use" - " a custom affinity matrix, " - "set ``affinity=precomputed``.") - - if self.affinity == 'nearest_neighbors': - connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors, - include_self=True, - n_jobs=self.n_jobs) - self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) - elif self.affinity == 'precomputed': - self.affinity_matrix_ = X - else: - params = self.kernel_params - if params is None: - params = {} - if not callable(self.affinity): - params['gamma'] = self.gamma - params['degree'] = self.degree - params['coef0'] = self.coef0 - self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity, - filter_params=True, - **params) - - random_state = check_random_state(self.random_state) - self.labels_ = spectral_clustering(self.affinity_matrix_, - n_clusters=self.n_clusters, - n_components=self.n_components, - eigen_solver=self.eigen_solver, - random_state=random_state, - n_init=self.n_init, - eigen_tol=self.eigen_tol, - assign_labels=self.assign_labels) - return self - - def fit_predict(self, X, y=None): - """Perform spectral clustering from features, or affinity matrix, - and return cluster labels. - - Parameters - ---------- - X : array-like or sparse matrix, shape (n_samples, n_features), or \ - array-like, shape (n_samples, n_samples) - Training instances to cluster, or similarities / affinities between - instances if ``affinity='precomputed'``. If a sparse matrix is - provided in a format other than ``csr_matrix``, ``csc_matrix``, - or ``coo_matrix``, it will be converted into a sparse - ``csr_matrix``. - - y : Ignored - Not used, present here for API consistency by convention. 
- - Returns - ------- - labels : ndarray, shape (n_samples,) - Cluster labels. - """ - return super().fit_predict(X, y) - - @property - def _pairwise(self): - return self.affinity == "precomputed" From 955699904133d73c57d2246873272d36a85256af Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Fri, 29 Nov 2019 12:10:55 -0500 Subject: [PATCH 34/36] restore the edits --- sklearn/cluster/_spectral.py | 53 ++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 78cdcc5073ccc..72daf8831c50a 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -18,6 +18,36 @@ from ._k_means import k_means +def clusterQR(vectors): + """Search for a partition matrix (clustering) which is + closest to the eigenvector embedding. + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + + References + ---------- + https://github.com/asdamle/QR-spectral-clustering + https://arxiv.org/abs/1708.07481 + """ + + k = vectors.shape[1] + piv = qr(vectors.T, pivoting=True)[2] + piv = piv[0:k] + UtSV = svd(vectors[piv, :].T) + Ut = UtSV[0] + Vt = UtSV[2].T.conj() + vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T))) + return vectors.argmax(axis=1).T + + def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None): """Search for a partition matrix (clustering) which is closest to the @@ -210,14 +240,18 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. 
- assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. But it can + space. There are three ways to assign labels after the laplacian + embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. See the 'Multiclass spectral clustering' paper referenced below for - more details on the discretization approach. + more details on the discretization approach. The newest clusterQR + directly extracts clusters from eigenvectors in spectral clustering. + In contrast to k-means and discretization, clusterQR has no tuning + parameters, e.g., runs no iterations, yet may outperform k-means and + discretization in terms of both quality and speed. Returns ------- @@ -247,9 +281,10 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering.
""" - if assign_labels not in ('kmeans', 'discretize'): - raise ValueError("The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', but '%s' was given" + if assign_labels not in ('kmeans', 'discretize', 'clusterQR'): + raise ValueError( + "The 'assign_labels' parameter should be " + "'kmeans', 'discretize', or 'clusterQR' but '%s' was given" % assign_labels) random_state = check_random_state(random_state) @@ -266,6 +301,8 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, if assign_labels == 'kmeans': _, labels, _ = k_means(maps, n_clusters, random_state=random_state, n_init=n_init) + elif assign_labels == 'clusterQR': + labels = clusterQR(maps) else: labels = discretize(maps, random_state=random_state) @@ -351,7 +388,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stopping criterion for eigendecomposition of the Laplacian matrix when ``eigen_solver='arpack'``. - assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding space. There are two ways to assign labels after the laplacian embedding. k-means can be applied and is a popular choice. 
But it can From 540f937404bcc25d8086f668a9bff02d311547a0 Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Fri, 29 Nov 2019 12:18:35 -0500 Subject: [PATCH 35/36] add imports --- sklearn/cluster/_spectral.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 72daf8831c50a..8143f3ac88599 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -9,6 +9,8 @@ import numpy as np +from scipy.linalg import qr, svd + from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array from ..utils.validation import check_array From 9bbd667d403eb745a79d1a9e1288b453310fd48c Mon Sep 17 00:00:00 2001 From: lobpcg <42650045+lobpcg@users.noreply.github.com> Date: Fri, 29 Nov 2019 17:53:37 -0500 Subject: [PATCH 36/36] trying to fix rst warnings --- doc/modules/clustering.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6be6ff011ea92..141cda7959d43 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -509,12 +509,6 @@ Alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. The recently added option ``clusterQR`` is 100% also reproducible. -=========================== =============================== ============================== - ``assign_labels="kmeans"`` |``assign_labels="discretize"`` |``assign_labels="clusterQR"`` -=========================== =============================== ============================== -|coin_kmeans| |coin_discretize| |coin_clusterQR| -=========================== =============================== ============================== - Spectral Clustering Graphs --------------------------