diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 3e22166f690d8..9a83f15713ea9 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -434,21 +434,24 @@ given sample. Spectral clustering =================== -:class:`SpectralClustering` does a low-dimension embedding of the -affinity matrix between samples, followed by a KMeans in the low -dimensional space. It is especially efficient if the affinity matrix is -sparse and the `pyamg `_ module is installed. -SpectralClustering requires the number of clusters to be specified. It -works well for a small number of clusters but is not advised when using -many clusters. - -For two clusters, it solves a convex relaxation of the `normalised -cuts `_ problem on -the similarity graph: cutting the graph in two so that the weight of the -edges cut is small compared to the weights of the edges inside each -cluster. This criteria is especially interesting when working on images: -graph vertices are pixels, and edges of the similarity graph are a -function of the gradient of the image. +:class:`SpectralClustering` performs a low-dimension embedding of the +affinity matrix between samples, followed by clustering, e.g., by KMeans, +of the components of the eigenvectors in the low dimensional space. +It is especially computationally efficient if the affinity matrix is sparse +and the `amg` solver is used for the eigenvalue problem (Note, the `amg` solver +requires that the `pyamg `_ module is installed.) + +The present version of SpectralClustering requires the number of clusters +to be specified in advance. It works well for a small number of clusters, +but is not advised for many clusters. + +For two clusters, SpectralClustering solves a convex relaxation of the +`normalised cuts `_ +problem on the similarity graph: cutting the graph in two so that the weight of +the edges cut is small compared to the weights of the edges inside each +cluster. 
This criterion is especially interesting when working on images, where +graph vertices are pixels, and weights of the edges of the similarity graph are +computed using a function of a gradient of the image. .. |noisy_img| image:: ../auto_examples/cluster/images/sphx_glr_plot_segmentation_toy_001.png @@ -495,12 +498,11 @@ Different label assignment strategies Different label assignment strategies can be used, corresponding to the ``assign_labels`` parameter of :class:`SpectralClustering`. -The ``"kmeans"`` strategy can match finer details of the data, but it can be -more unstable. In particular, unless you control the ``random_state``, it -may not be reproducible from run-to-run, as it depends on a random -initialization. On the other hand, the ``"discretize"`` strategy is 100% -reproducible, but it tends to create parcels of fairly even and -geometrical shape. +``"kmeans"`` strategy can match finer details, but can be unstable. +In particular, unless you control the ``random_state``, it may not be +reproducible from run-to-run, as it depends on random initialization. +The alternative ``"discretize"`` strategy is 100% reproducible, but tends +to create parcels of fairly even and geometrical shape. ===================================== ===================================== ``assign_labels="kmeans"`` ``assign_labels="discretize"`` @@ -511,7 +513,7 @@ geometrical shape. Spectral Clustering Graphs -------------------------- -Spectral Clustering can also be used to cluster graphs by their spectral +Spectral Clustering can also be used to partition graphs via their spectral embeddings. In this case, the affinity matrix is the adjacency matrix of the graph, and SpectralClustering is initialized with `affinity='precomputed'`:: @@ -538,6 +540,10 @@ graph, and SpectralClustering is initialized with `affinity='precomputed'`:: `_ Andrew Y. Ng, Michael I. 
Jordan, Yair Weiss, 2001 + * `"Preconditioned Spectral Clustering for Stochastic + Block Partition Streaming Graph Challenge" + `_ + David Zhuzhunashvili, Andrew Knyazev .. _hierarchical_clustering: diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 9034f643ebb55..ce3174218679f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -231,14 +231,20 @@ Changelog - |Fix| :class:`linear_model.LassoCV` no longer forces ``precompute=False`` when fitting the final model. :pr:`14591` by `Andreas Müller`_. - :mod:`sklearn.manifold` ....................... + - |Fix| Fixed a bug where :func:`manifold.spectral_embedding` (and therefore :class:`manifold.SpectralEmedding` and `clustering.SpectralClustering`) - computed wrong eigenvalues with ``solver='amg'`` when + computed wrong eigenvalues with ``eigen_solver='amg'`` when ``n_samples < 5 * n_components``. :pr:`14647` by `Andreas Müller`_. +- |Fix| Fixed a bug in :func:`manifold.spectral_embedding` used in + :class:`manifold.SpectralEmbedding` and :class:`cluster.spectral.SpectralClustering` + where ``eigen_solver="amg"`` would sometimes result in a LinAlgError. + :issue:`13393` by :user:`Andrew Knyazev ` + :pr:`13707` by :user:`Scott White ` + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 97103ce1fcc95..9142237fd5042 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -289,11 +289,25 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True) laplacian = _set_diag(laplacian, 1, norm_laplacian) + + # The Laplacian matrix is always singular, having at least one zero + # eigenvalue, corresponding to the trivial eigenvector, which is a + # constant. 
Using a singular matrix for preconditioning may result in + # random failures in LOBPCG and is not supported by the existing + # theory: + # see https://doi.org/10.1007/s10208-015-9297-1 + # Shift the Laplacian so its diagonal is not all ones. The shift + # does change the eigenpairs however, so we'll feed the shifted + # matrix to the solver and afterward set it back to the original. + diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) + laplacian += diag_shift ml = smoothed_aggregation_solver(check_array(laplacian, 'csr')) + laplacian -= diag_shift + M = ml.aspreconditioner() X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() - _, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12, + _, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-5, largest=False) embedding = diffusion_map.T if norm_laplacian: @@ -375,8 +389,7 @@ class SpectralEmbedding(BaseEstimator): eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. + to be installed. It can be faster on very large, sparse problems. n_neighbors : int, default : max(n_samples/10 , 1) Number of nearest neighbors for nearest_neighbors graph building. 
diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 18ebaab3a440c..d9c066c474b1c 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -162,10 +162,7 @@ def test_spectral_embedding_callable_affinity(seed=36): def test_spectral_embedding_amg_solver(seed=36): # Test spectral embedding with amg solver - try: - from pyamg import smoothed_aggregation_solver # noqa - except ImportError: - raise SkipTest("pyamg not available.") + pytest.importorskip('pyamg') se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", eigen_solver="amg", n_neighbors=5, @@ -193,6 +190,32 @@ def test_spectral_embedding_amg_solver(seed=36): assert _check_with_col_sign_flipping(embed_amg, embed_arpack, 1e-5) +def test_spectral_embedding_amg_solver_failure(seed=36): + # Test spectral embedding with amg solver failure, see issue #13393 + pytest.importorskip('pyamg') + + # The generated graph below is NOT fully connected if n_neighbors=3 + n_samples = 200 + n_clusters = 3 + n_features = 3 + centers = np.eye(n_clusters, n_features) + S, true_labels = make_blobs(n_samples=n_samples, centers=centers, + cluster_std=1., random_state=42) + + se_amg0 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors", + eigen_solver="amg", n_neighbors=3, + random_state=np.random.RandomState(seed)) + embed_amg0 = se_amg0.fit_transform(S) + + for i in range(10): + se_amg0.set_params(random_state=np.random.RandomState(seed + 1)) + embed_amg1 = se_amg0.fit_transform(S) + + assert _check_with_col_sign_flipping(embed_amg0, embed_amg1, 0.05) + + +@pytest.mark.filterwarnings("ignore:the behavior of nmi will " + "change in version 0.22") def test_pipeline_spectral_clustering(seed=36): # Test using pipeline to do spectral clustering random_state = np.random.RandomState(seed)