diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index ed79304fcbdee..141cda7959d43 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -453,7 +453,6 @@ cluster. This criteria is especially interesting when working on images, where graph vertices are pixels, and weights of the edges of the similarity graph are computed using a function of a gradient of the image. - .. |noisy_img| image:: ../auto_examples/cluster/images/sphx_glr_plot_segmentation_toy_001.png :target: ../auto_examples/cluster/plot_segmentation_toy.html :scale: 50 @@ -493,22 +492,22 @@ computed using a function of a gradient of the image. :target: ../auto_examples/cluster/plot_coin_segmentation.html :scale: 65 +.. |coin_clusterQR| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 65 + Different label assignment strategies ------------------------------------- Different label assignment strategies can be used, corresponding to the ``assign_labels`` parameter of :class:`SpectralClustering`. -``"kmeans"`` strategy can match finer details, but can be unstable. + +``"kmeans"`` strategy can match finer details, but it can be unstable. In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. -The alternative ``"discretize"`` strategy is 100% reproducible, but tends -to create parcels of fairly even and geometrical shape. - -===================================== ===================================== - ``assign_labels="kmeans"`` ``assign_labels="discretize"`` -===================================== ===================================== -|coin_kmeans| |coin_discretize| -===================================== ===================================== +The alternative ``"discretize"`` strategy is 100% reproducible, but tends +to create parcels of fairly even and geometrical shape. 
+The recently added option ``clusterQR`` is also 100% reproducible. Spectral Clustering Graphs -------------------------- @@ -540,6 +539,10 @@ graph, and SpectralClustering is initialized with `affinity='precomputed'`:: `_ Andrew Y. Ng, Michael I. Jordan, Yair Weiss, 2001 + * `"Robust and efficient multi-way spectral clustering" + `_ + Anil Damle, Victor Minden, Lexing Ying + * `"Preconditioned Spectral Clustering for Stochastic Block Partition Streaming Graph Challenge" `_ diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index bda1d717b2479..cc606aae2408a 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -10,16 +10,20 @@ This procedure (spectral clustering on an image) is an efficient approximate solution for finding normalized graph cuts. -There are two options to assign labels: +There are three options to assign labels: * with 'kmeans' spectral clustering will cluster samples in the embedding space - using a kmeans algorithm + using a kmeans algorithm, +* with 'clusterQR' it will cluster samples in the embedding space + using a clusterQR algorithm, * whereas 'discrete' will iteratively search for the closest partition space to the embedding space. + """ print(__doc__) # Author: Gael Varoquaux , Brian Cheung +# Andrew Knyazev added clusterQR # License: BSD 3 clause import time @@ -62,28 +66,34 @@ eps = 1e-6 graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps -# Apply spectral clustering (this step goes much faster if you have pyamg -# installed) -N_REGIONS = 25 +# The actual number of regions in this example is 27: background and 26 coins +N_REGIONS = 26 ############################################################################# -# Visualize the resulting regions +# Compute and visualize the resulting regions + +# Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. 
AMG is usually best +# It often helps the spectral clustering to compute a few extra eigenvectors +N_REGIONS_PLUS = 3 -for assign_labels in ('kmeans', 'discretize'): +for assign_labels in ('kmeans', 'discretize', 'clusterQR'): t0 = time.time() - labels = spectral_clustering(graph, n_clusters=N_REGIONS, - assign_labels=assign_labels, random_state=42) + labels = spectral_clustering(graph, + n_clusters=(N_REGIONS + N_REGIONS_PLUS), + assign_labels=assign_labels, random_state=42, + eigen_solver='arpack') t1 = time.time() labels = labels.reshape(rescaled_coins.shape) plt.figure(figsize=(5, 5)) - plt.imshow(rescaled_coins, cmap=plt.cm.gray) - for l in range(N_REGIONS): - plt.contour(labels == l, - colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) + plt.imshow(rescaled_coins, cmap=plt.get_cmap('gray')) plt.xticks(()) plt.yticks(()) title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0)) print(title) plt.title(title) + for l in range(N_REGIONS): + plt.contour(labels == l, + colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) + plt.pause(0.5) plt.show() diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 78cdcc5073ccc..8143f3ac88599 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -9,6 +9,8 @@ import numpy as np +from scipy.linalg import qr, svd + from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array from ..utils.validation import check_array @@ -18,6 +20,36 @@ from ._k_means import k_means +def clusterQR(vectors): + """Search for a partition matrix (clustering) which is + closest to the eigenvector embedding. + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. 
+ References + ---------- + https://github.com/asdamle/QR-spectral-clustering + https://arxiv.org/abs/1708.07481 + """ + + k = vectors.shape[1] + piv = qr(vectors.T, pivoting=True)[2] + piv = piv[0:k] + UtSV = svd(vectors[piv, :].T) + Ut = UtSV[0] + Vt = UtSV[2].T.conj() + vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T))) + return vectors.argmax(axis=1).T + + def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None): """Search for a partition matrix (clustering) which is closest to the @@ -210,14 +242,18 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. - assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. But it can + space. There are three ways to assign labels after the laplacian + embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. See the 'Multiclass spectral clustering' paper referenced below for - more details on the discretization approach. + more details on the discretization approach. The newest clusterQR + directly extracts clusters from eigenvectors in spectral clustering. + In contrast to k-means and discretization, clusterQR has no tuning + parameters, e.g., runs no iterations, yet may outperform k-means and + discretization in terms of both quality and speed. Returns ------- @@ -247,9 +283,10 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering. 
""" - if assign_labels not in ('kmeans', 'discretize'): - raise ValueError("The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', but '%s' was given" + if assign_labels not in ('kmeans', 'discretize', 'clusterQR'): + raise ValueError( + "The 'assign_labels' parameter should be " + "'kmeans', 'discretize', or 'clusterQR' but '%s' was given" % assign_labels) random_state = check_random_state(random_state) @@ -266,6 +303,8 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, if assign_labels == 'kmeans': _, labels, _ = k_means(maps, n_clusters, random_state=random_state, n_init=n_init) + elif assign_labels == 'clusterQR': + labels = clusterQR(maps) else: labels = discretize(maps, random_state=random_state) @@ -351,7 +390,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stopping criterion for eigendecomposition of the Laplacian matrix when ``eigen_solver='arpack'``. - assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + assign_labels : {'kmeans', 'discretize', 'clusterQR'}, default: 'kmeans' The strategy to use to assign labels in the embedding space. There are two ways to assign labels after the laplacian embedding. k-means can be applied and is a popular choice. But it can diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index dc79f427afcdf..4caae77000583 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -28,7 +28,11 @@ @pytest.mark.parametrize('eigen_solver', ('arpack', 'lobpcg')) -@pytest.mark.parametrize('assign_labels', ('kmeans', 'discretize')) +@pytest.mark.parametrize( + 'assign_labels', + ('kmeans', + 'discretize', + 'clusterQR')) def test_spectral_clustering(eigen_solver, assign_labels): S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],