diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 09637b5d938d1..ac4807e052f66 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -492,11 +492,15 @@ computed using a function of a gradient of the image. .. |coin_kmeans| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_001.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 .. |coin_discretize| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_002.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 + +.. |coin_cluster_qr| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 35 Different label assignment strategies ------------------------------------- @@ -508,12 +512,24 @@ In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. +The recently added ``"cluster_qr"`` option is a deterministic alternative that +tends to create the visually best partitioning on the example application +below. + +================================ ================================ ================================ + ``assign_labels="kmeans"`` ``assign_labels="discretize"`` ``assign_labels="cluster_qr"`` +================================ ================================ ================================ +|coin_kmeans| |coin_discretize| |coin_cluster_qr| +================================ ================================ ================================ + +.. topic:: References: + + * `"Multiclass spectral clustering" + `_ + Stella X. 
Yu, Jianbo Shi, 2003 -===================================== ===================================== - ``assign_labels="kmeans"`` ``assign_labels="discretize"`` -===================================== ===================================== -|coin_kmeans| |coin_discretize| -===================================== ===================================== + * :doi:`"Simple, direct, and efficient multi-way spectral clustering"<10.1093/imaiai/iay008>` + Anil Damle, Victor Minden, Lexing Ying, 2019 Spectral Clustering Graphs -------------------------- diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 372f47e0c7c4b..433366ee35fe0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -49,6 +49,13 @@ Changelog add this information to the plot. :pr:`21038` by :user:`Guillaume Lemaitre `. +- |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral_clustering` + now include the new `'cluster_qr'` method from :func:`cluster.cluster_qr` + that clusters samples in the embedding space as an alternative to the existing + `'kmeans'` and `'discretize'` methods. + See :func:`cluster.spectral_clustering` for more details. + :pr:`21148` by :user:`Andrew Knyazev `. + :mod:`sklearn.cross_decomposition` .................................. diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 4d83d2bccf639..cf916df3167c2 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -10,16 +10,19 @@ This procedure (spectral clustering on an image) is an efficient approximate solution for finding normalized graph cuts. 
-There are two options to assign labels: +There are three options to assign labels: -* with 'kmeans' spectral clustering will cluster samples in the embedding space +* 'kmeans' spectral clustering clusters samples in the embedding space using a kmeans algorithm -* whereas 'discrete' will iteratively search for the closest partition - space to the embedding space. - +* 'discretize' iteratively searches for the closest partition + space to the embedding space of spectral clustering. +* 'cluster_qr' assigns labels using the QR factorization with pivoting + that directly determines the partition in the embedding space. """ -# Author: Gael Varoquaux , Brian Cheung +# Author: Gael Varoquaux +# Brian Cheung +# Andrew Knyazev # License: BSD 3 clause import time @@ -61,28 +64,51 @@ eps = 1e-6 graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps -# Apply spectral clustering (this step goes much faster if you have pyamg -# installed) -N_REGIONS = 25 +# The number of segmented regions to display needs to be chosen manually. +# The current version of 'spectral_clustering' does not support determining +# the number of good quality clusters automatically. +n_regions = 26 # %% -# Visualize the resulting regions - -for assign_labels in ("kmeans", "discretize"): +# Compute and visualize the resulting regions + +# Computing a few extra eigenvectors may speed up the eigen_solver. +# The spectral clustering quality may also benefit from requesting +# extra regions for segmentation. +n_regions_plus = 3 + +# Apply spectral clustering using the default eigen_solver='arpack'. +# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'. +# Choosing eigen_solver='amg' requires an extra package called 'pyamg'. +# The quality of segmentation and the speed of calculations are mostly determined +# by the choice of the solver and the value of the tolerance 'eigen_tol'. +# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243. 
+for assign_labels in ("kmeans", "discretize", "cluster_qr"): t0 = time.time() labels = spectral_clustering( - graph, n_clusters=N_REGIONS, assign_labels=assign_labels, random_state=42 + graph, + n_clusters=(n_regions + n_regions_plus), + eigen_tol=1e-7, + assign_labels=assign_labels, + random_state=42, ) + t1 = time.time() labels = labels.reshape(rescaled_coins.shape) - plt.figure(figsize=(5, 5)) plt.imshow(rescaled_coins, cmap=plt.cm.gray) - for l in range(N_REGIONS): - plt.contour(labels == l, colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) + plt.xticks(()) plt.yticks(()) title = "Spectral clustering: %s, %.2fs" % (assign_labels, (t1 - t0)) print(title) plt.title(title) + for l in range(n_regions): + colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))] + plt.contour(labels == l, colors=colors) + # To view individual segments as they appear, call plt.pause(0.5) here plt.show() + +# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver +# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol +# explicitly in this example. diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 8b80f9999b403..f96a11c177c8a 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -1,14 +1,18 @@ # -*- coding: utf-8 -*- """Algorithms for spectral clustering""" -# Author: Gael Varoquaux gael.varoquaux@normalesup.org +# Author: Gael Varoquaux # Brian Cheung # Wei LI +# Andrew Knyazev # License: BSD 3 clause import warnings import numpy as np +from scipy.linalg import LinAlgError, qr, svd +from scipy.sparse import csc_matrix + from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array from ..utils.deprecation import deprecated @@ -18,6 +22,38 @@ from ._kmeans import k_means +def cluster_qr(vectors): + """Find the discrete partition closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + + .. 
versionadded:: 1.1 + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The cluster labels of vectors. + + References + ---------- + .. [1] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <https://doi.org/10.1093/imaiai/iay008>`_ + + """ + + k = vectors.shape[1] + _, _, piv = qr(vectors.T, pivoting=True) + ut, _, v = svd(vectors[piv[:k], :].T) + vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) + return vectors.argmax(axis=1) + + def discretize( vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None ): @@ -73,9 +109,6 @@ def discretize( """ - from scipy.sparse import csc_matrix - from scipy.linalg import LinAlgError - random_state = check_random_state(random_state) vectors = as_float_array(vectors, copy=copy) @@ -200,10 +233,11 @@ def spectral_clustering( Number of eigenvectors to use for the spectral embedding eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. If None, then ``'arpack'`` is - used. See [4]_ for more details regarding `'lobpcg'`. + The eigenvalue decomposition method. If None then ``'arpack'`` is used. + See [4]_ for more details regarding ``'lobpcg'``. + Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional + Algebraic MultiGrid preconditioning and requires pyamg to be installed. + It can be faster on very large sparse problems [6]_ and [7]_. random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization @@ -229,12 +263,19 @@ def spectral_clustering( Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. 
- assign_labels : {'kmeans', 'discretize'}, default='kmeans' + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the Laplacian + space. There are three ways to assign labels after the Laplacian embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. + The cluster_qr method [5]_ directly extracts clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and is not an iterative method, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. verbose : bool, default=False Verbosity mode. @@ -262,23 +303,38 @@ def spectral_clustering( `_ .. [4] `Toward the Optimal Preconditioned Eigensolver: - Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001. + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 A. V. Knyazev SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. - `_ + <https://doi.org/10.1137/S1064827500366124>`_ + + .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <https://doi.org/10.1093/imaiai/iay008>`_ + + .. [6] `Multiscale Spectral Image Segmentation Multiscale preconditioning + for computing eigenvalues of graph Laplacians in image segmentation, 2006 + Andrew Knyazev + <https://doi.org/10.13140/RG.2.2.35280.02565>`_ + + .. [7] `Preconditioned spectral clustering for stochastic block partition + streaming graph challenge (Preliminary version at arXiv.) 
+ David Zhuzhunashvili, Andrew Knyazev + <https://doi.org/10.1109/HPEC.2017.8091045>`_ Notes ----- - The graph should contain only one connect component, elsewhere + The graph should contain only one connected component, otherwise the results make little sense. This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering. """ - if assign_labels not in ("kmeans", "discretize"): + if assign_labels not in ("kmeans", "discretize", "cluster_qr"): raise ValueError( "The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', but '%s' was given" % assign_labels + "'kmeans' or 'discretize', or 'cluster_qr', " + f"but {assign_labels!r} was given" ) if isinstance(affinity, np.matrix): raise TypeError( @@ -312,6 +368,8 @@ def spectral_clustering( _, labels, _ = k_means( maps, n_clusters, random_state=random_state, n_init=n_init, verbose=verbose ) + elif assign_labels == "cluster_qr": + labels = cluster_qr(maps) else: labels = discretize(maps, random_state=random_state) @@ -407,12 +465,19 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stopping criterion for eigendecomposition of the Laplacian matrix when ``eigen_solver='arpack'``. - assign_labels : {'kmeans', 'discretize'}, default='kmeans' + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' The strategy for assigning labels in the embedding space. There are two ways to assign labels after the Laplacian embedding. k-means is a popular choice, but it can be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. + The cluster_qr method [5]_ directly extracts clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and runs no iterations, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. 
degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. @@ -502,6 +567,10 @@ class SpectralClustering(ClusterMixin, BaseEstimator): SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. `_ + .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <https://doi.org/10.1093/imaiai/iay008>`_ + Examples -------- >>> from sklearn.cluster import SpectralClustering diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 679adf27520e4..07dd4b64514ac 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -12,7 +12,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import discretize +from sklearn.cluster._spectral import discretize, cluster_qr from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score @@ -29,7 +29,7 @@ @pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg")) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_spectral_clustering(eigen_solver, assign_labels): S = np.array( [ @@ -101,7 +101,8 @@ def test_spectral_unknown_assign_labels(): spectral_clustering(S, n_clusters=2, random_state=0, assign_labels="") -def test_spectral_clustering_sparse(): +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_spectral_clustering_sparse(assign_labels): X, y = make_blobs( n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 ) @@ -111,7 +112,12 @@ def test_spectral_clustering_sparse(): S = sparse.coo_matrix(S) labels = ( - SpectralClustering(random_state=0, n_clusters=2, affinity="precomputed") + SpectralClustering( + random_state=0, 
n_clusters=2, + affinity="precomputed", + assign_labels=assign_labels, + ) .fit(S) .labels_ ) @@ -191,6 +197,36 @@ def histogram(x, y, **kwargs): sp.fit(X) +def test_cluster_qr(): + # cluster_qr by itself should not be used for clustering generic data + # other than the rows of the eigenvectors within spectral clustering, + # but cluster_qr must still preserve the labels for different dtypes + # of the generic fixed input even if the labels may be meaningless. + random_state = np.random.RandomState(seed=8) + n_samples, n_components = 10, 5 + data = random_state.randn(n_samples, n_components) + labels_float64 = cluster_qr(data.astype(np.float64)) + # Each sample is assigned a cluster identifier + assert labels_float64.shape == (n_samples,) + # All components should be covered by the assignment + assert np.array_equal(np.unique(labels_float64), np.arange(n_components)) + # Single precision data should yield the same cluster assignments + labels_float32 = cluster_qr(data.astype(np.float32)) + assert np.array_equal(labels_float64, labels_float32) + + +def test_cluster_qr_permutation_invariance(): + # cluster_qr must be invariant to sample permutation. + random_state = np.random.RandomState(seed=8) + n_samples, n_components = 100, 5 + data = random_state.randn(n_samples, n_components) + perm = random_state.permutation(n_samples) + assert np.array_equal( + cluster_qr(data)[perm], + cluster_qr(data[perm]), + ) + + @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) def test_discretize(n_samples): # Test the discretize using a noise assignment matrix @@ -283,7 +319,7 @@ def test_n_components(): assert not np.array_equal(labels, labels_diff_ncomp) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_verbose(assign_labels, capsys): # Check verbose mode of KMeans for better coverage. X, y = make_blobs(