1
1
# -*- coding: utf-8 -*-
2
2
"""Algorithms for spectral clustering"""
3
3
4
- # Author: Gael Varoquaux [email protected]
4
+ # Author: Gael Varoquaux < [email protected] >
5
5
# Brian Cheung
6
6
7
+ # Andrew Knyazev <[email protected] >
7
8
# License: BSD 3 clause
8
9
import warnings
9
10
10
11
import numpy as np
11
12
13
+ from scipy .linalg import LinAlgError , qr , svd
14
+ from scipy .sparse import csc_matrix
15
+
12
16
from ..base import BaseEstimator , ClusterMixin
13
17
from ..utils import check_random_state , as_float_array
14
18
from ..utils .deprecation import deprecated
18
22
from ._kmeans import k_means
19
23
20
24
25
def cluster_qr(vectors):
    """Find the discrete partition closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    .. versionadded:: 1.1

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples. Anything accepted by
        ``np.asarray`` (ndarray, nested lists, tuples) may be passed.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The cluster labels of vectors.

    References
    ----------
    .. [1] `Simple, direct, and efficient multi-way spectral clustering, 2019
        Anil Damle, Victor Minden, Lexing Ying
        <:doi:`10.1093/imaiai/iay008`>`_

    """
    # The contract above is "array-like": coerce once so that plain
    # sequences expose ``.shape`` / ``.T``. ndarrays pass through uncopied.
    vectors = np.asarray(vectors)

    k = vectors.shape[1]
    # Column-pivoted QR of the transposed embedding; only the pivot order
    # is needed, its first k entries index the rows of ``vectors`` used below.
    _, _, piv = qr(vectors.T, pivoting=True)
    # SVD of the k selected rows (transposed); the singular values are
    # discarded and only the two singular-vector factors are combined.
    ut, _, v = svd(vectors[piv[:k], :].T)
    # Map every sample through ut @ conj(v) and take magnitudes; the column
    # with the largest magnitude is the sample's label.
    vectors = abs(np.dot(vectors, np.dot(ut, v.conj())))
    return vectors.argmax(axis=1)
55
+
56
+
21
57
def discretize (
22
58
vectors , * , copy = True , max_svd_restarts = 30 , n_iter_max = 20 , random_state = None
23
59
):
@@ -73,9 +109,6 @@ def discretize(
73
109
74
110
"""
75
111
76
- from scipy .sparse import csc_matrix
77
- from scipy .linalg import LinAlgError
78
-
79
112
random_state = check_random_state (random_state )
80
113
81
114
vectors = as_float_array (vectors , copy = copy )
@@ -200,10 +233,11 @@ def spectral_clustering(
200
233
Number of eigenvectors to use for the spectral embedding
201
234
202
235
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
203
- The eigenvalue decomposition strategy to use. AMG requires pyamg
204
- to be installed. It can be faster on very large, sparse problems,
205
- but may also lead to instabilities. If None, then ``'arpack'`` is
206
- used. See [4]_ for more details regarding `'lobpcg'`.
236
+ The eigenvalue decomposition method. If None then ``'arpack'`` is used.
237
+ See [4]_ for more details regarding ``'lobpcg'``.
238
+ Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
239
+ Algebraic MultiGrid preconditioning and requires pyamg to be installed.
240
+ It can be faster on very large sparse problems [6]_ and [7]_.
207
241
208
242
random_state : int, RandomState instance, default=None
209
243
A pseudo random number generator used for the initialization
@@ -229,12 +263,19 @@ def spectral_clustering(
229
263
Stopping criterion for eigendecomposition of the Laplacian matrix
230
264
when using arpack eigen_solver.
231
265
232
- assign_labels : {'kmeans', 'discretize'}, default='kmeans'
266
+ assign_labels : {'kmeans', 'discretize', 'cluster_qr' }, default='kmeans'
233
267
The strategy to use to assign labels in the embedding
234
- space. There are two ways to assign labels after the Laplacian
268
+ space. There are three ways to assign labels after the Laplacian
235
269
embedding. k-means can be applied and is a popular choice. But it can
236
270
also be sensitive to initialization. Discretization is another
237
271
approach which is less sensitive to random initialization [3]_.
272
+ The cluster_qr method [5]_ directly extracts clusters from eigenvectors
273
+ in spectral clustering. In contrast to k-means and discretization, cluster_qr
274
+ has no tuning parameters and is not an iterative method, yet may outperform
275
+ k-means and discretization in terms of both quality and speed.
276
+
277
+ .. versionchanged:: 1.1
278
+ Added new labeling method 'cluster_qr'.
238
279
239
280
verbose : bool, default=False
240
281
Verbosity mode.
@@ -262,23 +303,38 @@ def spectral_clustering(
262
303
<https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf>`_
263
304
264
305
.. [4] `Toward the Optimal Preconditioned Eigensolver:
265
- Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001.
306
+ Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
266
307
A. V. Knyazev
267
308
SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
268
- <https://epubs.siam.org/doi/pdf/10.1137/S1064827500366124>`_
309
+ <:doi:`10.1137/S1064827500366124`>`_
310
+
311
+ .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019
312
+ Anil Damle, Victor Minden, Lexing Ying
313
+ <:doi:`10.1093/imaiai/iay008`>`_
314
+
315
+ .. [6] `Multiscale Spectral Image Segmentation Multiscale preconditioning
316
+ for computing eigenvalues of graph Laplacians in image segmentation, 2006
317
+ Andrew Knyazev
318
+ <:doi:`10.13140/RG.2.2.35280.02565`>`_
319
+
320
+ .. [7] `Preconditioned spectral clustering for stochastic block partition
321
+ streaming graph challenge (Preliminary version at arXiv.)
322
+ David Zhuzhunashvili, Andrew Knyazev
323
+ <:doi:`10.1109/HPEC.2017.8091045`>`_
269
324
270
325
Notes
271
326
-----
272
- The graph should contain only one connect component, elsewhere
327
+ The graph should contain only one connected component, otherwise
273
328
the results make little sense.
274
329
275
330
This algorithm solves the normalized cut for k=2: it is a
276
331
normalized spectral clustering.
277
332
"""
278
- if assign_labels not in ("kmeans" , "discretize" ):
333
+ if assign_labels not in ("kmeans" , "discretize" , "cluster_qr" ):
279
334
raise ValueError (
280
335
"The 'assign_labels' parameter should be "
281
- "'kmeans' or 'discretize', but '%s' was given" % assign_labels
336
+ "'kmeans' or 'discretize', or 'cluster_qr', "
337
+ f"but { assign_labels !r} was given"
282
338
)
283
339
if isinstance (affinity , np .matrix ):
284
340
raise TypeError (
@@ -312,6 +368,8 @@ def spectral_clustering(
312
368
_ , labels , _ = k_means (
313
369
maps , n_clusters , random_state = random_state , n_init = n_init , verbose = verbose
314
370
)
371
+ elif assign_labels == "cluster_qr" :
372
+ labels = cluster_qr (maps )
315
373
else :
316
374
labels = discretize (maps , random_state = random_state )
317
375
@@ -407,12 +465,19 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
407
465
Stopping criterion for eigendecomposition of the Laplacian matrix
408
466
when ``eigen_solver='arpack'``.
409
467
410
- assign_labels : {'kmeans', 'discretize'}, default='kmeans'
468
+ assign_labels : {'kmeans', 'discretize', 'cluster_qr' }, default='kmeans'
411
469
The strategy for assigning labels in the embedding space. There are three
412
470
ways to assign labels after the Laplacian embedding. k-means is a
413
471
popular choice, but it can be sensitive to initialization.
414
472
Discretization is another approach which is less sensitive to random
415
473
initialization [3]_.
474
+ The cluster_qr method [5]_ directly extracts clusters from eigenvectors
475
+ in spectral clustering. In contrast to k-means and discretization, cluster_qr
476
+ has no tuning parameters and runs no iterations, yet may outperform
477
+ k-means and discretization in terms of both quality and speed.
478
+
479
+ .. versionchanged:: 1.1
480
+ Added new labeling method 'cluster_qr'.
416
481
417
482
degree : float, default=3
418
483
Degree of the polynomial kernel. Ignored by other kernels.
@@ -502,6 +567,10 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
502
567
SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
503
568
<https://epubs.siam.org/doi/pdf/10.1137/S1064827500366124>`_
504
569
570
+ .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019
571
+ Anil Damle, Victor Minden, Lexing Ying
572
+ <:doi:`10.1093/imaiai/iay008`>`_
573
+
505
574
Examples
506
575
--------
507
576
>>> from sklearn.cluster import SpectralClustering
0 commit comments