From 67f66b89d22848e3245cc953d0c28e878c0a671b Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 13:28:57 -0400 Subject: [PATCH 01/90] Update _spectral.py re-introducing https://github.com/scikit-learn/scikit-learn/pull/12316 --- sklearn/cluster/_spectral.py | 46 ++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 8b80f9999b403..73c644cb0f352 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -18,6 +18,35 @@ from ._kmeans import k_means +def cluster_qr(vectors): + """Search for a partition matrix (clustering) which is + closest to the eigenvector embedding. + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + References + ---------- + https://github.com/asdamle/QR-spectral-clustering + https://arxiv.org/abs/1708.07481 + """ + + from scipy.linalg import qr, svd + + k = vectors.shape[1] + piv = qr(vectors.T, pivoting=True)[2] + piv = piv[0:k] + UtSV = svd(vectors[piv, :].T) + Ut = UtSV[0] + Vt = UtSV[2].T.conj() + vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T))) + return vectors.argmax(axis=1).T + + def discretize( vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None ): @@ -229,12 +258,16 @@ def spectral_clustering( Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. - assign_labels : {'kmeans', 'discretize'}, default='kmeans' + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the Laplacian + space. There are three ways to assign labels after the Laplacian embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. + The newest cluster_qr method directly extract clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, + cluster_qr has no tuning parameters, e.g., runs no iterations, yet may outperform + k-means and discretization in terms of both quality and speed. verbose : bool, default=False Verbosity mode. @@ -275,10 +308,11 @@ def spectral_clustering( This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering. """ - if assign_labels not in ("kmeans", "discretize"): + if assign_labels not in ("kmeans", "discretize", 'cluster_qr'): raise ValueError( "The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', but '%s' was given" % assign_labels + "'kmeans' or 'discretize', or 'cluster_qr', but '%s' was given" + % assign_labels ) if isinstance(affinity, np.matrix): raise TypeError( @@ -312,6 +346,8 @@ def spectral_clustering( _, labels, _ = k_means( maps, n_clusters, random_state=random_state, n_init=n_init, verbose=verbose ) + elif assign_labels == 'cluster_qr': + labels = cluster_qr(maps) else: labels = discretize(maps, random_state=random_state) @@ -407,7 +443,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stopping criterion for eigendecomposition of the Laplacian matrix when ``eigen_solver='arpack'``. - assign_labels : {'kmeans', 'discretize'}, default='kmeans' + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' The strategy for assigning labels in the embedding space. There are two ways to assign labels after the Laplacian embedding. k-means is a popular choice, but it can be sensitive to initialization. From bcb9e8b4ca398cd5568abc4a7291d371d325d67c Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 13:42:24 -0400 Subject: [PATCH 02/90] Update _spectral.py lint fix --- sklearn/cluster/_spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 73c644cb0f352..b92a438e3ac67 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -311,8 +311,8 @@ def spectral_clustering( if assign_labels not in ("kmeans", "discretize", 'cluster_qr'): raise ValueError( "The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', or 'cluster_qr', but '%s' was given" - % assign_labels + "'kmeans' or 'discretize', or 'cluster_qr', " + "but '%s' was given" % assign_labels ) if isinstance(affinity, np.matrix): raise TypeError( From d521873ea849fd6c5893ab62ac5c1942b5f3decb Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 13:44:39 -0400 Subject: [PATCH 03/90] Update _spectral.py trailing space fixed --- sklearn/cluster/_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index b92a438e3ac67..1c0e2778dad44 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -311,7 +311,7 @@ def spectral_clustering( if assign_labels not in ("kmeans", "discretize", 'cluster_qr'): raise ValueError( "The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', or 'cluster_qr', " + "'kmeans' or 'discretize', or 'cluster_qr', " "but '%s' was given" % assign_labels ) if isinstance(affinity, np.matrix): From e72bbd9ae636a98c408a212b4007cd9ee1051ff0 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 13:55:29 -0400 Subject: [PATCH 04/90] Update _spectral.py line too long --- sklearn/cluster/_spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 1c0e2778dad44..0f01de451f236 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -265,8 +265,8 @@ def spectral_clustering( also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. The newest cluster_qr method directly extract clusters from eigenvectors - in spectral clustering. In contrast to k-means and discretization, - cluster_qr has no tuning parameters, e.g., runs no iterations, yet may outperform + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parametersand runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. verbose : bool, default=False From 734dd37c7f0e31aaa3e62c30458991272db88e2a Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 14:11:02 -0400 Subject: [PATCH 05/90] Update test_spectral.py added "cluster_qr" --- sklearn/cluster/tests/test_spectral.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 679adf27520e4..3561c48eb064d 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -12,6 +12,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering +from sklearn.cluster._spectral import cluster_qr from sklearn.cluster._spectral import discretize from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances @@ -29,7 +30,9 @@ @pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg")) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize")) +@pytest.mark.parametrize( + "assign_labels", + ("kmeans", "discretize", "cluster_qr")) def test_spectral_clustering(eigen_solver, assign_labels): S = np.array( [ @@ -283,7 +286,7 @@ def test_n_components(): assert not np.array_equal(labels, labels_diff_ncomp) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_verbose(assign_labels, capsys): # Check verbose mode of KMeans for better coverage. X, y = make_blobs( From 360b2f71528a4c3bcbbed02eda3a9acd3db3a57a Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 14:43:00 -0400 Subject: [PATCH 06/90] Update test_spectral.py lint fix --- sklearn/cluster/tests/test_spectral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 3561c48eb064d..747344e5563cc 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -12,7 +12,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import cluster_qr from sklearn.cluster._spectral import discretize from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances From 9a1f70fff79b57903e1de61f9054d39bb511c168 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 14:56:37 -0400 Subject: [PATCH 07/90] Update plot_coin_segmentation.py added "cluster_qr" --- examples/cluster/plot_coin_segmentation.py | 33 +++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 9fb9b11be2753..96718a4925e19 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -10,12 +10,15 @@ This procedure (spectral clustering on an image) is an efficient approximate solution for finding normalized graph cuts. -There are two options to assign labels: +There are three options to assign labels: * with 'kmeans' spectral clustering will cluster samples in the embedding space using a kmeans algorithm +* with 'cluster_qr' will cluster samples in the embedding space + using a cluster_qr algorithm, * whereas 'discrete' will iteratively search for the closest partition space to the embedding space. + """ print(__doc__) @@ -64,26 +67,36 @@ # Apply spectral clustering (this step goes much faster if you have pyamg # installed) -N_REGIONS = 25 + +# The actual number of regions in this example is 27: background and 26 coins +N_REGIONS = 26 # %% -# Visualize the resulting regions +# Compute and visualize the resulting regions -for assign_labels in ('kmeans', 'discretize'): +# Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. AMG is usually best +# It often helps the spectral clustering to compute a few extra eigenvectors +N_REGIONS_PLUS = 3 + +for assign_labels in ('kmeans', 'discretize', 'cluster_qr'): t0 = time.time() - labels = spectral_clustering(graph, n_clusters=N_REGIONS, - assign_labels=assign_labels, random_state=42) + labels = spectral_clustering(graph, + n_clusters=(N_REGIONS + N_REGIONS_PLUS), + assign_labels=assign_labels, random_state=42, + eigen_solver='arpack') t1 = time.time() labels = labels.reshape(rescaled_coins.shape) plt.figure(figsize=(5, 5)) - plt.imshow(rescaled_coins, cmap=plt.cm.gray) - for l in range(N_REGIONS): - plt.contour(labels == l, - colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) + plt.imshow(rescaled_coins, cmap=plt.get_cmap('gray')) + plt.xticks(()) plt.yticks(()) title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0)) print(title) plt.title(title) + for l in range(N_REGIONS): + plt.contour(labels == l, + colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) + plt.pause(0.5) plt.show() From 3f87ceedae226d4bff7db74d75caeaf607a3394b Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 15:02:18 -0400 Subject: [PATCH 08/90] Update clustering.rst added cluster_qr --- doc/modules/clustering.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 65f33fe1fbebb..39cc2eece08be 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -497,6 +497,10 @@ computed using a function of a gradient of the image. :target: ../auto_examples/cluster/plot_coin_segmentation.html :scale: 65 +.. |coin_cluster_qr| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 65 + Different label assignment strategies ------------------------------------- @@ -507,6 +511,8 @@ In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. +The recently added option ``clusterQR`` is 100% also reproducible and tends +to create the visually best partitioning. ===================================== ===================================== ``assign_labels="kmeans"`` ``assign_labels="discretize"`` From 5fe24dc172e259e74b89aa6688117b56d58e5cf3 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 15:11:05 -0400 Subject: [PATCH 09/90] Update plot_coin_segmentation.py E128 continuation line under-indented fixed --- examples/cluster/plot_coin_segmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 96718a4925e19..a2a8569e94743 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -81,9 +81,9 @@ for assign_labels in ('kmeans', 'discretize', 'cluster_qr'): t0 = time.time() labels = spectral_clustering(graph, - n_clusters=(N_REGIONS + N_REGIONS_PLUS), - assign_labels=assign_labels, random_state=42, - eigen_solver='arpack') + n_clusters=(N_REGIONS + N_REGIONS_PLUS), + assign_labels=assign_labels, random_state=42, + eigen_solver='arpack') t1 = time.time() labels = labels.reshape(rescaled_coins.shape) From 10e649bf5b39f783d22b840142d94267eae1d327 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 16:17:07 -0400 Subject: [PATCH 10/90] Update v1.1.rst added PR #21148 info --- doc/whats_new/v1.1.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 3aabed6214771..8824efaba3694 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,6 +38,16 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.cluster` +.................... + +- |Enhancement| :func:`cluster._spectral` now includes the 'cluster_qr' method + that clusters samples in the embedding space just as 'kmeans' and 'discrete'. + :func:`cluster.plot_coin_segmentation' now compares all three alternatives. + Documentation :doc: 'modules/clustering' and unit :test: 'test_spectral.py' + have been updated to incorporate 'cluster_qr'. + :pr:`21148` by :user:`Andrew Knyazev ` + :mod:`sklearn.linear_model` ........................... From 70f0c400aa7cf4cb6f08ffc8ff3f46e6d0c29097 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 16:41:11 -0400 Subject: [PATCH 11/90] Update v1.1.rst formatting fixed --- doc/whats_new/v1.1.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 8824efaba3694..8ef9d68dbe53e 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -43,8 +43,8 @@ Changelog - |Enhancement| :func:`cluster._spectral` now includes the 'cluster_qr' method that clusters samples in the embedding space just as 'kmeans' and 'discrete'. - :func:`cluster.plot_coin_segmentation' now compares all three alternatives. - Documentation :doc: 'modules/clustering' and unit :test: 'test_spectral.py' + :func:`cluster.plot_coin_segmentation` now compares all three alternatives. + Documentation :doc: `modules/clustering` and unit :test: `test_spectral.py` have been updated to incorporate 'cluster_qr'. :pr:`21148` by :user:`Andrew Knyazev ` From fa943304fb1c781286ebe4ab33af2f2d48ee8d27 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 17:20:57 -0400 Subject: [PATCH 12/90] Update v1.1.rst title underline fixed --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 8ef9d68dbe53e..f14a849315676 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -39,7 +39,7 @@ Changelog where 123456 is the *pull request* number, not the issue number. :mod:`sklearn.cluster` -.................... +...................... - |Enhancement| :func:`cluster._spectral` now includes the 'cluster_qr' method that clusters samples in the embedding space just as 'kmeans' and 'discrete'. From a52aec01cabeef34bc4e04fbe731cbfb6a1fe8cd Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sat, 25 Sep 2021 19:22:11 -0400 Subject: [PATCH 13/90] Update clustering.rst added cluster_qr to plots --- doc/modules/clustering.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 39cc2eece08be..5334657aedbc7 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -514,11 +514,11 @@ to create parcels of fairly even and geometrical shape. The recently added option ``clusterQR`` is 100% also reproducible and tends to create the visually best partitioning. -===================================== ===================================== - ``assign_labels="kmeans"`` ``assign_labels="discretize"`` -===================================== ===================================== -|coin_kmeans| |coin_discretize| -===================================== ===================================== +================================ ================================ ================================ + ``assign_labels="kmeans"`` ``assign_labels="discretize"`` ``assign_labels="cluster_qr"`` +================================ ================================ ================================ +|coin_kmeans| |coin_discretize| |coin_cluster_qr| +================================ ================================ ================================ Spectral Clustering Graphs -------------------------- From e661e22a9931f63b2742b10f5e5bfea8a857d94c Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 12:22:15 -0400 Subject: [PATCH 14/90] Update _spectral.py black formatting --- sklearn/cluster/_spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 0f01de451f236..65b6abac9a5b5 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -308,7 +308,7 @@ def spectral_clustering( This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering. """ - if assign_labels not in ("kmeans", "discretize", 'cluster_qr'): + if assign_labels not in ("kmeans", "discretize", "cluster_qr"): raise ValueError( "The 'assign_labels' parameter should be " "'kmeans' or 'discretize', or 'cluster_qr', " @@ -346,7 +346,7 @@ def spectral_clustering( _, labels, _ = k_means( maps, n_clusters, random_state=random_state, n_init=n_init, verbose=verbose ) - elif assign_labels == 'cluster_qr': + elif assign_labels == "cluster_qr": labels = cluster_qr(maps) else: labels = discretize(maps, random_state=random_state) From 79504f225d4c282c54d2d7e147ee94a241ace230 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 12:26:22 -0400 Subject: [PATCH 15/90] Update test_spectral.py black formatting --- sklearn/cluster/tests/test_spectral.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 747344e5563cc..519c09436303e 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -29,9 +29,7 @@ @pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg")) -@pytest.mark.parametrize( - "assign_labels", - ("kmeans", "discretize", "cluster_qr")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_spectral_clustering(eigen_solver, assign_labels): S = np.array( [ From 24fcf28f4aa775d501ecad7b655809ae510a662e Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 15:20:05 -0400 Subject: [PATCH 16/90] Update test_spectral.py trying to change the discretize test by itself to also test kmeans by itself and cluster_qr by itself --- sklearn/cluster/tests/test_spectral.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 519c09436303e..4d1d6c818e2d5 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,9 +191,10 @@ def histogram(x, y, **kwargs): sp.fit(X) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) -def test_discretize(n_samples): - # Test the discretize using a noise assignment matrix +def test_direct_clustering(n_samples, assign_labels): + # Test direct clustering using a noise assignment matrix random_state = np.random.RandomState(seed=8) for n_class in range(2, 10): # random class labels @@ -207,7 +208,7 @@ def test_discretize(n_samples): y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( n_samples, n_class + 1 ) - y_pred = discretize(y_true_noisy, random_state=random_state) + y_pred = assign_labels(y_true_noisy, random_state=random_state) assert adjusted_rand_score(y_true, y_pred) > 0.8 From 898a287f8937f995a80f3819de149166007384f2 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 15:38:23 -0400 Subject: [PATCH 17/90] Update test_spectral.py adding test cluster_qr by itself to the same test with discretize --- sklearn/cluster/tests/test_spectral.py | 33 +++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 4d1d6c818e2d5..00d438c197919 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -12,7 +12,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import discretize +from sklearn.cluster._spectral import discretize, cluster_qr from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score @@ -191,25 +191,26 @@ def histogram(x, y, **kwargs): sp.fit(X) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +@pytest.mark.parametrize("assign_labels", ("discretize", "cluster_qr")) @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) def test_direct_clustering(n_samples, assign_labels): # Test direct clustering using a noise assignment matrix random_state = np.random.RandomState(seed=8) - for n_class in range(2, 10): - # random class labels - y_true = random_state.randint(0, n_class + 1, n_samples) - y_true = np.array(y_true, float) - # noise class assignment matrix - y_indicator = sparse.coo_matrix( - (np.ones(n_samples), (np.arange(n_samples), y_true)), - shape=(n_samples, n_class + 1), - ) - y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( - n_samples, n_class + 1 - ) - y_pred = assign_labels(y_true_noisy, random_state=random_state) - assert adjusted_rand_score(y_true, y_pred) > 0.8 + for fn in [assign_labels]: + for n_class in range(2, 10): + # random class labels + y_true = random_state.randint(0, n_class + 1, n_samples) + y_true = np.array(y_true, float) + # noise class assignment matrix + y_indicator = sparse.coo_matrix( + (np.ones(n_samples), (np.arange(n_samples), y_true)), + shape=(n_samples, n_class + 1), + ) + y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( + n_samples, n_class + 1 + ) + y_pred = fn(y_true_noisy, random_state=random_state) + assert adjusted_rand_score(y_true, y_pred) > 0.8 # TODO: Remove when pyamg does replaces sp.rand call with np.random.rand From fb6494581637bb78385c27f0c45dc4dcdf65bc0d Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 15:46:14 -0400 Subject: [PATCH 18/90] Update test_spectral.py error fixes --- sklearn/cluster/tests/test_spectral.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 00d438c197919..8c7fc9772b06d 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,12 +191,11 @@ def histogram(x, y, **kwargs): sp.fit(X) -@pytest.mark.parametrize("assign_labels", ("discretize", "cluster_qr")) @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) def test_direct_clustering(n_samples, assign_labels): # Test direct clustering using a noise assignment matrix random_state = np.random.RandomState(seed=8) - for fn in [assign_labels]: + for fn in [discretize, cluster_qr]: for n_class in range(2, 10): # random class labels y_true = random_state.randint(0, n_class + 1, n_samples) From f8622453e72c7d5b23117e5d0a9da64ee06092c5 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 17:34:45 -0400 Subject: [PATCH 19/90] Update test_spectral.py error fix --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 8c7fc9772b06d..77f636de46593 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -192,7 +192,7 @@ def histogram(x, y, **kwargs): @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) -def test_direct_clustering(n_samples, assign_labels): +def test_direct_clustering(n_samples): # Test direct clustering using a noise assignment matrix random_state = np.random.RandomState(seed=8) for fn in [discretize, cluster_qr]: From 41759aa41da3815358e149669d2c843d40906274 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 18:51:20 -0400 Subject: [PATCH 20/90] Update test_spectral.py cluster_qr apparently requires n_class>2, so change the test to start with n_class=3 --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 77f636de46593..f0e5dbd0b0528 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -196,7 +196,7 @@ def test_direct_clustering(n_samples): # Test direct clustering using a noise assignment matrix random_state = np.random.RandomState(seed=8) for fn in [discretize, cluster_qr]: - for n_class in range(2, 10): + for n_class in range(3, 10): # random class labels y_true = random_state.randint(0, n_class + 1, n_samples) y_true = np.array(y_true, float) From 7f8c60fafaab9e7b484692aa2321981d66ab0bce Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 27 Sep 2021 20:59:58 -0400 Subject: [PATCH 21/90] Update test_spectral.py reverted to working version --- sklearn/cluster/tests/test_spectral.py | 35 +++++++++++++------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index f0e5dbd0b0528..519c09436303e 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -12,7 +12,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import discretize, cluster_qr +from sklearn.cluster._spectral import discretize from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score @@ -192,24 +192,23 @@ def histogram(x, y, **kwargs): @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) -def test_direct_clustering(n_samples): - # Test direct clustering using a noise assignment matrix +def test_discretize(n_samples): + # Test the discretize using a noise assignment matrix random_state = np.random.RandomState(seed=8) - for fn in [discretize, cluster_qr]: - for n_class in range(3, 10): - # random class labels - y_true = random_state.randint(0, n_class + 1, n_samples) - y_true = np.array(y_true, float) - # noise class assignment matrix - y_indicator = sparse.coo_matrix( - (np.ones(n_samples), (np.arange(n_samples), y_true)), - shape=(n_samples, n_class + 1), - ) - y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( - n_samples, n_class + 1 - ) - y_pred = fn(y_true_noisy, random_state=random_state) - assert adjusted_rand_score(y_true, y_pred) > 0.8 + for n_class in range(2, 10): + # random class labels + y_true = random_state.randint(0, n_class + 1, n_samples) + y_true = np.array(y_true, float) + # noise class assignment matrix + y_indicator = sparse.coo_matrix( + (np.ones(n_samples), (np.arange(n_samples), y_true)), + shape=(n_samples, n_class + 1), + ) + y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( + n_samples, n_class + 1 + ) + y_pred = discretize(y_true_noisy, random_state=random_state) + assert adjusted_rand_score(y_true, y_pred) > 0.8 # TODO: Remove when pyamg does replaces sp.rand call with np.random.rand From bbab00ae84206ffd76b58d331f0ff252416ecf35 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 14:33:32 -0400 Subject: [PATCH 22/90] Update test_spectral.py added a test of cluster_qr itself --- sklearn/cluster/tests/test_spectral.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 519c09436303e..8a295cc3c0e73 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,6 +191,14 @@ def histogram(x, y, **kwargs): sp.fit(X) +def test_cluster_qr(): + # Test cluster_qr for fixed data + random_state = np.random.RandomState(seed=8) + data = random_state.randn(10, 5) + labels = cluster_qr(data) + assert labels == [2 1 3 3 2 4 1 3 4 0] + + @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) def test_discretize(n_samples): # Test the discretize using a noise assignment matrix From 53dca44196a8cb91c93150f81127c39513fa39cb Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 14:42:15 -0400 Subject: [PATCH 23/90] Update test_spectral.py error fix --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 8a295cc3c0e73..1a46efc1268c3 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -196,7 +196,7 @@ def test_cluster_qr(): random_state = np.random.RandomState(seed=8) data = random_state.randn(10, 5) labels = cluster_qr(data) - assert labels == [2 1 3 3 2 4 1 3 4 0] + assert not np.array_equal(labels, [2, 1, 3, 3, 2, 4, 1, 3, 4, 0]) @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) From a028dbafbae2b3de07dce5cb31a3025ca0a28585 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 14:46:09 -0400 Subject: [PATCH 24/90] Update test_spectral.py error fix --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 1a46efc1268c3..055209eb3aea4 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -12,7 +12,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import discretize +from sklearn.cluster._spectral import discretize, cluster_qr from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score From 23f35dfb2270b42a8bf0ea4c99562328f32d4938 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 15:08:12 -0400 Subject: [PATCH 25/90] Update test_spectral.py error fix --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 055209eb3aea4..603f79c43edd4 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -196,7 +196,7 @@ def test_cluster_qr(): random_state = np.random.RandomState(seed=8) data = random_state.randn(10, 5) labels = cluster_qr(data) - assert not np.array_equal(labels, [2, 1, 3, 3, 2, 4, 1, 3, 4, 0]) + assert not np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) From 6fa84241644a20417062da7c9b86c254e971cb7c Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 15:09:30 -0400 Subject: [PATCH 26/90] Update test_spectral.py error fix --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 603f79c43edd4..1981911e80737 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -196,7 +196,7 @@ def test_cluster_qr(): random_state = np.random.RandomState(seed=8) data = random_state.randn(10, 5) labels = cluster_qr(data) - assert not np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) + assert np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) From 9f92b5a7211fd5f1339ab35f373aff555f36f72e Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 15:59:15 -0400 Subject: [PATCH 27/90] Update test_spectral.py added dtypes ["np.float32", "np.float64"] to the test --- sklearn/cluster/tests/test_spectral.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 1981911e80737..43c0e97c531c9 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,10 +191,11 @@ def histogram(x, y, **kwargs): sp.fit(X) -def test_cluster_qr(): - # Test cluster_qr for fixed data +@pytest.mark.parametrize("_dtype", ["np.float32", "np.float64"]) +def test_cluster_qr(_dtype): + # Test cluster_qr for fixed data different dtypes random_state = np.random.RandomState(seed=8) - data = random_state.randn(10, 5) + data = random_state.randn(10, 5).astype(_dtype) labels = cluster_qr(data) assert np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) From 5cdc1ea49a00e0ad29e9b838a0073cca0ac6cd06 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 16:21:11 -0400 Subject: [PATCH 28/90] Update test_spectral.py changed dtypes to ["single", "double"] --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 43c0e97c531c9..79daa0d6c9080 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -191,7 +191,7 @@ def histogram(x, y, **kwargs): sp.fit(X) -@pytest.mark.parametrize("_dtype", ["np.float32", "np.float64"]) +@pytest.mark.parametrize("_dtype", ["single", "double"]) def test_cluster_qr(_dtype): # Test cluster_qr for fixed data different dtypes random_state = np.random.RandomState(seed=8) From 8454a692f4870cd0285d6bef73a2ae01fe830110 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 28 Sep 2021 17:14:44 -0400 Subject: [PATCH 29/90] Update clustering.rst changed plots scale from 65 to 35 to make space for cluster_qr and generate docs --- doc/modules/clustering.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 5334657aedbc7..e68a51709bb0a 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -491,15 +491,15 @@ computed using a function of a gradient of the image. .. |coin_kmeans| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_001.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 .. |coin_discretize| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_002.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 .. |coin_cluster_qr| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 Different label assignment strategies ------------------------------------- @@ -517,7 +517,7 @@ to create the visually best partitioning. ================================ ================================ ================================ ``assign_labels="kmeans"`` ``assign_labels="discretize"`` ``assign_labels="cluster_qr"`` ================================ ================================ ================================ -|coin_kmeans| |coin_discretize| |coin_cluster_qr| +|coin_kmeans| |coin_discretize| |coin_cluster_qr| ================================ ================================ ================================ Spectral Clustering Graphs From 82c35438a9358dc37a6396e57e0e2b557ec05244 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 29 Sep 2021 10:39:59 -0400 Subject: [PATCH 30/90] Update test_spectral.py test all 3 options ("assign_labels", ("kmeans", "discretize", "cluster_qr")) in test_spectral_clustering_sparse --- sklearn/cluster/tests/test_spectral.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 79daa0d6c9080..f164a252f7c5b 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -101,6 +101,7 @@ def test_spectral_unknown_assign_labels(): spectral_clustering(S, n_clusters=2, random_state=0, assign_labels="") +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_spectral_clustering_sparse(): X, y = make_blobs( n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 @@ -111,7 +112,11 @@ def test_spectral_clustering_sparse(): S = sparse.coo_matrix(S) labels = ( - SpectralClustering(random_state=0, n_clusters=2, affinity="precomputed") + SpectralClustering( + random_state=0, + n_clusters=2, + affinity="precomputed", + assign_labels=assign_labels) .fit(S) .labels_ ) From 524228ceaca699102a7c85d1b8404460834bec49 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 29 Sep 2021 11:07:22 -0400 Subject: [PATCH 31/90] Update test_spectral.py black formatting --- sklearn/cluster/tests/test_spectral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index f164a252f7c5b..735573cb9ba2e 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -116,7 +116,8 @@ def test_spectral_clustering_sparse(): random_state=0, n_clusters=2, affinity="precomputed", - assign_labels=assign_labels) + assign_labels=assign_labels, + ) .fit(S) .labels_ ) From d992122747057ec7867b1e99e77e0dc3ab130f6d Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 29 Sep 2021 11:40:59 -0400 Subject: [PATCH 32/90] Update test_spectral.py lint formatting errors fix --- sklearn/cluster/tests/test_spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 735573cb9ba2e..a52a87d0ae5b9 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -49,7 +49,7 @@ def test_spectral_clustering(eigen_solver, assign_labels): n_clusters=2, affinity="precomputed", eigen_solver=eigen_solver, - assign_labels=assign_labels, + assign_labels=assign_labels ).fit(mat) labels = model.labels_ if labels[0] == 0: @@ -116,7 +116,7 @@ def test_spectral_clustering_sparse(): random_state=0, n_clusters=2, affinity="precomputed", - assign_labels=assign_labels, + assign_labels=assign_labels ) .fit(S) .labels_ From e207aa43990ad952657137f8df46e463c005ce71 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 29 Sep 2021 11:50:30 -0400 Subject: [PATCH 33/90] Update test_spectral.py remove all trailing commas in multi-line function call for consistency --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index a52a87d0ae5b9..f2196b30c7a7a 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -140,7 +140,7 @@ def test_precomputed_nearest_neighbors_filtering(): random_state=0, n_clusters=2, affinity="precomputed_nearest_neighbors", - n_neighbors=n_neighbors, + n_neighbors=n_neighbors ) .fit(graph) .labels_ From 87b4ffdb8f32d31b60492cdf958010d4e01a0f4c Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 29 Sep 2021 11:58:20 -0400 Subject: [PATCH 34/90] Update test_spectral.py run black --- sklearn/cluster/tests/test_spectral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index f2196b30c7a7a..735573cb9ba2e 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -49,7 +49,7 @@ def test_spectral_clustering(eigen_solver, assign_labels): n_clusters=2, affinity="precomputed", eigen_solver=eigen_solver, - assign_labels=assign_labels + assign_labels=assign_labels, ).fit(mat) labels = model.labels_ if labels[0] == 0: @@ -116,7 +116,7 @@ def test_spectral_clustering_sparse(): random_state=0, n_clusters=2, affinity="precomputed", - assign_labels=assign_labels + assign_labels=assign_labels, ) .fit(S) .labels_ @@ -140,7 +140,7 @@ def test_precomputed_nearest_neighbors_filtering(): random_state=0, n_clusters=2, affinity="precomputed_nearest_neighbors", - n_neighbors=n_neighbors + n_neighbors=n_neighbors, ) .fit(graph) .labels_ From 7d2b030f4bc74499f9db6aad7630ae96ae6c5bd5 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 29 Sep 2021 12:32:20 -0400 Subject: [PATCH 35/90] Update test_spectral.py error fix --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 735573cb9ba2e..f6243191f6441 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -102,7 +102,7 @@ def test_spectral_unknown_assign_labels(): @pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) -def test_spectral_clustering_sparse(): +def test_spectral_clustering_sparse(assign_labels): X, y = make_blobs( n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 ) From 73a629532567d0f7de17b5980158f5fa5ecab784 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sun, 3 Oct 2021 22:37:46 -0400 Subject: [PATCH 36/90] Update clustering.rst minor --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index e68a51709bb0a..84c3b31f4b2d9 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -511,7 +511,7 @@ In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. -The recently added option ``clusterQR`` is 100% also reproducible and tends +The recently added option ``clusterQR`` is also 100% reproducible and tends to create the visually best partitioning. ================================ ================================ ================================ From bf5486e906d25f7e32f83ad6c0bc910ba40882a5 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sun, 3 Oct 2021 22:46:50 -0400 Subject: [PATCH 37/90] Update plot_coin_segmentation.py minor --- examples/cluster/plot_coin_segmentation.py | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index a2a8569e94743..602c1cd8a311b 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -14,10 +14,10 @@ * with 'kmeans' spectral clustering will cluster samples in the embedding space using a kmeans algorithm -* with 'cluster_qr' will cluster samples in the embedding space - using a cluster_qr algorithm, +* with 'cluster_qr' spectral clustering will cluster samples in the embedding + space using a cluster_qr algorithm, * whereas 'discrete' will iteratively search for the closest partition - space to the embedding space. + space to the embedding space of spectral clustering. """ print(__doc__) @@ -66,24 +66,26 @@ graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps # Apply spectral clustering (this step goes much faster if you have pyamg -# installed) +# installed and use eigen_solver = 'amg'). However, any valid solver can +# be used (e.g., 'arpack', 'lobpcg', or 'amg'). +eigen_solver = 'arpack' # The actual number of regions in this example is 27: background and 26 coins -N_REGIONS = 26 +n_regions = 26 # %% # Compute and visualize the resulting regions # Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. AMG is usually best # It often helps the spectral clustering to compute a few extra eigenvectors -N_REGIONS_PLUS = 3 +n_regions_plus = 3 for assign_labels in ('kmeans', 'discretize', 'cluster_qr'): t0 = time.time() labels = spectral_clustering(graph, - n_clusters=(N_REGIONS + N_REGIONS_PLUS), + n_clusters=(n_regions + n_regions_plus), assign_labels=assign_labels, random_state=42, - eigen_solver='arpack') + eigen_solver='eigen_solver') t1 = time.time() labels = labels.reshape(rescaled_coins.shape) @@ -95,8 +97,11 @@ title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0)) print(title) plt.title(title) - for l in range(N_REGIONS): + for l in range(n_regions): plt.contour(labels == l, - colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) + colors=[plt.cm.nipy_spectral((l+3) / float(n_regions + 3))]) + colors = plt.cm.nipy_spectral((l + n_regions_plus) / + float(n_regions + n_regions_plus)) + plt.contour(labels == l, colors=colors) plt.pause(0.5) plt.show() From 20a18f2043c179f7ce9673debd39e9d270b41ea4 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sun, 3 Oct 2021 22:54:20 -0400 Subject: [PATCH 38/90] Update plot_coin_segmentation.py trailing whitespace removed --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 602c1cd8a311b..850e901971b71 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -14,7 +14,7 @@ * with 'kmeans' spectral clustering will cluster samples in the embedding space using a kmeans algorithm -* with 'cluster_qr' spectral clustering will cluster samples in the embedding +* with 'cluster_qr' spectral clustering will cluster samples in the embedding space using a cluster_qr algorithm, * whereas 'discrete' will iteratively search for the closest partition space to the embedding space of spectral clustering. From c28b9c918d9e7083aac9f947a3344255c0ffadf7 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Sun, 3 Oct 2021 23:01:10 -0400 Subject: [PATCH 39/90] Update plot_coin_segmentation.py indentation fixed --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 850e901971b71..5fcf4fd908aed 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -101,7 +101,7 @@ plt.contour(labels == l, colors=[plt.cm.nipy_spectral((l+3) / float(n_regions + 3))]) colors = plt.cm.nipy_spectral((l + n_regions_plus) / - float(n_regions + n_regions_plus)) + float(n_regions + n_regions_plus)) plt.contour(labels == l, colors=colors) plt.pause(0.5) plt.show() From 56c7bb43c15d2f1a07c78765874b867d6d6ab921 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 00:09:15 -0400 Subject: [PATCH 40/90] Update plot_coin_segmentation.py minor error --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 5fcf4fd908aed..b95d8bacfcfe0 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -85,7 +85,7 @@ labels = spectral_clustering(graph, n_clusters=(n_regions + n_regions_plus), assign_labels=assign_labels, random_state=42, - eigen_solver='eigen_solver') + eigen_solver=eigen_solver) t1 = time.time() labels = labels.reshape(rescaled_coins.shape) From 4d7112494500a39f5bb28a1d3ba2d3921cebdc0a Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 01:42:48 -0400 Subject: [PATCH 41/90] Update plot_coin_segmentation.py error fixed --- examples/cluster/plot_coin_segmentation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index b95d8bacfcfe0..121d6b2797577 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -98,8 +98,6 @@ print(title) plt.title(title) for l in range(n_regions): - plt.contour(labels == l, - colors=[plt.cm.nipy_spectral((l+3) / float(n_regions + 3))]) colors = plt.cm.nipy_spectral((l + n_regions_plus) / float(n_regions + n_regions_plus)) plt.contour(labels == l, colors=colors) From 2ae0513038d6d0745c1875f2e078f4471b54060f Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 01:55:32 -0400 Subject: [PATCH 42/90] Update _spectral.py proposed by @victorminden --- sklearn/cluster/_spectral.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 65b6abac9a5b5..2633616a85cce 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -21,18 +21,25 @@ def cluster_qr(vectors): """Search for a partition matrix (clustering) which is closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + Parameters ---------- vectors : array-like, shape: (n_samples, n_clusters) The embedding space of the samples. + Returns ------- labels : array of integers, shape: n_samples The labels of the clusters. + References ---------- - https://github.com/asdamle/QR-spectral-clustering - https://arxiv.org/abs/1708.07481 + .. [1] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + `_ + """ from scipy.linalg import qr, svd @@ -40,10 +47,8 @@ def cluster_qr(vectors): k = vectors.shape[1] piv = qr(vectors.T, pivoting=True)[2] piv = piv[0:k] - UtSV = svd(vectors[piv, :].T) - Ut = UtSV[0] - Vt = UtSV[2].T.conj() - vectors = abs(np.dot(vectors, np.dot(Ut, Vt.T))) + ut, _, v = svd(vectors[piv[:k], :].T) + vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) return vectors.argmax(axis=1).T @@ -264,7 +269,7 @@ def spectral_clustering( embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. - The newest cluster_qr method directly extract clusters from eigenvectors + The newest cluster_qr method [5]_ directly extract clusters from eigenvectors in spectral clustering. In contrast to k-means and discretization, cluster_qr has no tuning parametersand runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. @@ -300,9 +305,13 @@ def spectral_clustering( SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. `_ + .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + `_ + Notes ----- - The graph should contain only one connect component, elsewhere + The graph should contain only one connected component, elsewhere the results make little sense. This algorithm solves the normalized cut for k=2: it is a From 2f74cd7e118e45e11c7104359be70b2ad7fec556 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 02:46:42 -0400 Subject: [PATCH 43/90] Update plot_coin_segmentation.py colors = plt.cm.nipy_spectral((l + n_regions_plus) / float(n_regions + n_regions_plus)) plt.contour(labels == l, colors=colors) -> plt.contour(labels == l, colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) --- examples/cluster/plot_coin_segmentation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 121d6b2797577..5ebc0144818d5 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -98,8 +98,7 @@ print(title) plt.title(title) for l in range(n_regions): - colors = plt.cm.nipy_spectral((l + n_regions_plus) / - float(n_regions + n_regions_plus)) - plt.contour(labels == l, colors=colors) + plt.contour(labels == l, + colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) plt.pause(0.5) plt.show() From e9926c60cd4103af12c02437204e9a95733d4e2d Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 02:57:15 -0400 Subject: [PATCH 44/90] Update plot_coin_segmentation.py --- examples/cluster/plot_coin_segmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 5ebc0144818d5..03bed5cdb91eb 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -98,7 +98,7 @@ print(title) plt.title(title) for l in range(n_regions): - plt.contour(labels == l, - colors=[plt.cm.nipy_spectral((l+3) / float(N_REGIONS+3))]) - plt.pause(0.5) + colors = [plt.cm.nipy_spectral((l + 3) / float(n_regions + 3))] + plt.contour(labels == l, colors=colors) + plt.pause(0.5) plt.show() From 27efc117532fe5323f30cdbf6d85c01fd8a24567 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 03:11:54 -0400 Subject: [PATCH 45/90] Update plot_coin_segmentation.py --- examples/cluster/plot_coin_segmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 03bed5cdb91eb..b5f8a9392f5fc 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -98,7 +98,7 @@ print(title) plt.title(title) for l in range(n_regions): - colors = [plt.cm.nipy_spectral((l + 3) / float(n_regions + 3))] - plt.contour(labels == l, colors=colors) - plt.pause(0.5) + colors = [plt.cm.nipy_spectral((l + 3) / float(n_regions + 3))] + plt.contour(labels == l, colors=colors) + plt.pause(0.5) plt.show() From 8e875f31f9c5942f1e9a557731252d48e5234f3a Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 4 Oct 2021 22:03:38 -0400 Subject: [PATCH 46/90] Update plot_coin_segmentation.py typo fixed --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index b5f8a9392f5fc..4a322d093f437 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -70,7 +70,7 @@ # be used (e.g., 'arpack', 'lobpcg', or 'amg'). eigen_solver = 'arpack' -# The actual number of regions in this example is 27: background and 26 coins +# The number of regions n_regions = 26 # %% From 10077aad93c7a56c91095f4fa7b12a0798df6e89 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 11:56:36 -0400 Subject: [PATCH 47/90] Update sklearn/cluster/_spectral.py nitpick: we can use f-strings since we dropped Python 3.6 support. Co-authored-by: Olivier Grisel --- sklearn/cluster/_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 2633616a85cce..438c02ed9278a 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -321,7 +321,7 @@ def spectral_clustering( raise ValueError( "The 'assign_labels' parameter should be " "'kmeans' or 'discretize', or 'cluster_qr', " - "but '%s' was given" % assign_labels + f"but {assign_labels!r} was given" ) if isinstance(affinity, np.matrix): raise TypeError( From b35f1ea2177670964d5403157ebd3086134112a9 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 13:18:14 -0400 Subject: [PATCH 48/90] Update plot_coin_segmentation.py black --- examples/cluster/plot_coin_segmentation.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 8f51e75f798e6..7965ef0c33305 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -67,7 +67,7 @@ # Apply spectral clustering (this step goes much faster if you have pyamg # installed and use eigen_solver = 'amg'). However, any valid solver can # be used (e.g., 'arpack', 'lobpcg', or 'amg'). -eigen_solver = 'arpack' +eigen_solver = "arpack" # The number of regions n_regions = 26 @@ -79,18 +79,21 @@ # It often helps the spectral clustering to compute a few extra eigenvectors n_regions_plus = 3 -for assign_labels in ('kmeans', 'discretize', 'cluster_qr'): +for assign_labels in ("kmeans", "discretize", "cluster_qr"): t0 = time.time() - labels = spectral_clustering(graph, - n_clusters=(n_regions + n_regions_plus), - assign_labels=assign_labels, random_state=42, - eigen_solver=eigen_solver) + labels = spectral_clustering( + graph, + n_clusters=(n_regions + n_regions_plus), + assign_labels=assign_labels, + random_state=42, + eigen_solver=eigen_solver, + ) t1 = time.time() labels = labels.reshape(rescaled_coins.shape) plt.figure(figsize=(5, 5)) plt.imshow(rescaled_coins, cmap=plt.cm.gray) - + plt.xticks(()) plt.yticks(()) title = "Spectral clustering: %s, %.2fs" % (assign_labels, (t1 - t0)) From 84066a688e9f68c3ddeda51ae10ee78556ff5d2d Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 14:27:02 -0400 Subject: [PATCH 49/90] Update doc/whats_new/v1.1.rst minor edit Co-authored-by: Olivier Grisel --- doc/whats_new/v1.1.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index bed8e9797b3c7..d179015e7578e 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -49,10 +49,8 @@ Changelog ...................... - |Enhancement| :func:`cluster._spectral` now includes the 'cluster_qr' method - that clusters samples in the embedding space just as 'kmeans' and 'discrete'. - :func:`cluster.plot_coin_segmentation` now compares all three alternatives. - Documentation :doc: `modules/clustering` and unit :test: `test_spectral.py` - have been updated to incorporate 'cluster_qr'. + that clusters samples in the embedding space as an alternative to the existing + 'kmeans' and 'discrete' methods. See `spectral_clustering`_ for more details. :pr:`21148` by :user:`Andrew Knyazev ` :mod:`sklearn.ensemble` From 7974f2829112ecf38766ca2a68f3465f0a9377d4 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 14:28:34 -0400 Subject: [PATCH 50/90] Update sklearn/cluster/tests/test_spectral.py dtype edits Co-authored-by: Olivier Grisel --- sklearn/cluster/tests/test_spectral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index f6243191f6441..a74d9d3fab5e2 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -197,11 +197,11 @@ def histogram(x, y, **kwargs): sp.fit(X) -@pytest.mark.parametrize("_dtype", ["single", "double"]) -def test_cluster_qr(_dtype): +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_cluster_qr(dtype): # Test cluster_qr for fixed data different dtypes random_state = np.random.RandomState(seed=8) - data = random_state.randn(10, 5).astype(_dtype) + data = random_state.randn(10, 5).astype(dtype) labels = cluster_qr(data) assert np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) From 3e3d3ccd4073a6df947f922bb0d960345f902856 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 15:27:59 -0400 Subject: [PATCH 51/90] Update v1.1.rst https://github.com/scikit-learn/scikit-learn/pull/21148/files#r723413166 --- doc/whats_new/v1.1.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index d179015e7578e..8c6c63f35f114 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -48,7 +48,8 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Enhancement| :func:`cluster._spectral` now includes the 'cluster_qr' method +- |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral` + now includes the new 'cluster_qr' method from :func:`cluster.cluster_qr` that clusters samples in the embedding space as an alternative to the existing 'kmeans' and 'discrete' methods. See `spectral_clustering`_ for more details. :pr:`21148` by :user:`Andrew Knyazev ` From 26dd8fe6cfee7c1d62f36d80dc17c16338ef45cd Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 15:45:26 -0400 Subject: [PATCH 52/90] Update plot_coin_segmentation.py .. versionchanged:: 1.1 Added new labeling method 'cluster_qr'. Etc --- examples/cluster/plot_coin_segmentation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 7965ef0c33305..33db521abce0b 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -18,7 +18,12 @@ space using a cluster_qr algorithm, * whereas 'discrete' will iteratively search for the closest partition space to the embedding space of spectral clustering. - +.. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. + Changed the color scheme in plotting for better visibility. + Introduced a pause between plotting subsequent lables to visualize + individual labels when run manually. + Indtroduced an ablility to plot less lables than actually computed. """ print(__doc__) From 02d380450f17dac88c44b33bebfb4f25f0ffda01 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 15:47:44 -0400 Subject: [PATCH 53/90] Update plot_coin_segmentation.py 3 -> 4 --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 33db521abce0b..9ed638beb0f5c 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -105,7 +105,7 @@ print(title) plt.title(title) for l in range(n_regions): - colors = [plt.cm.nipy_spectral((l + 3) / float(n_regions + 3))] + colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))] plt.contour(labels == l, colors=colors) plt.pause(0.5) plt.show() From 98a4078c32eb31b5dab71025be55d7f4c5f27073 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 15:56:13 -0400 Subject: [PATCH 54/90] Update _spectral.py added .. versionchanged:: --- sklearn/cluster/_spectral.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 438c02ed9278a..0a2693028f7ea 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -40,6 +40,7 @@ def cluster_qr(vectors): Anil Damle, Victor Minden, Lexing Ying `_ + .. versionadded:: 1.1 """ from scipy.linalg import qr, svd @@ -274,6 +275,9 @@ def spectral_clustering( has no tuning parametersand runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. + verbose : bool, default=False Verbosity mode. @@ -309,6 +313,9 @@ def spectral_clustering( Anil Damle, Victor Minden, Lexing Ying `_ + .. versionchanged:: 1.1 + Added new reference for the new labeling method 'cluster_qr'. + Notes ----- The graph should contain only one connected component, elsewhere @@ -458,6 +465,13 @@ class SpectralClustering(ClusterMixin, BaseEstimator): popular choice, but it can be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. + The newest cluster_qr method [5]_ directly extract clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parametersand runs no iterations, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. @@ -547,6 +561,13 @@ class SpectralClustering(ClusterMixin, BaseEstimator): SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. `_ + .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + `_ + + .. versionchanged:: 1.1 + Added new reference for the new labeling method 'cluster_qr'. + Examples -------- >>> from sklearn.cluster import SpectralClustering From 8b5c52d50347a73807bde00d07cd98eb2375d9f4 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 16:19:50 -0400 Subject: [PATCH 55/90] Update v1.1.rst Unknown reference to `spectral_clustering`_ ? removed --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 8c6c63f35f114..061013c8684f8 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -51,7 +51,7 @@ Changelog - |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral` now includes the new 'cluster_qr' method from :func:`cluster.cluster_qr` that clusters samples in the embedding space as an alternative to the existing - 'kmeans' and 'discrete' methods. See `spectral_clustering`_ for more details. + 'kmeans' and 'discrete' methods. See `spectral_clustering` for more details. :pr:`21148` by :user:`Andrew Knyazev ` :mod:`sklearn.ensemble` From e22e2c1782ac29c25f0ea8916ec289267767c9b6 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 16:57:05 -0400 Subject: [PATCH 56/90] Update _spectral.py .. versionchanged:: 1.1' spacing --- sklearn/cluster/_spectral.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 0a2693028f7ea..46f946973d6cf 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -275,8 +275,8 @@ def spectral_clustering( has no tuning parametersand runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. - .. versionchanged:: 1.1 - Added new labeling method 'cluster_qr'. + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. verbose : bool, default=False Verbosity mode. @@ -470,8 +470,8 @@ class SpectralClustering(ClusterMixin, BaseEstimator): has no tuning parametersand runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. - .. versionchanged:: 1.1 - Added new labeling method 'cluster_qr'. + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. From 8a2838d00a70f4bd91a026a077ec338c7db2bcfe Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 17:47:32 -0400 Subject: [PATCH 57/90] Update _spectral.py typos fixed --- sklearn/cluster/_spectral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 46f946973d6cf..bc1e99549a756 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -270,9 +270,9 @@ def spectral_clustering( embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. - The newest cluster_qr method [5]_ directly extract clusters from eigenvectors + The newest cluster_qr method [5]_ directly extracts clusters from eigenvectors in spectral clustering. In contrast to k-means and discretization, cluster_qr - has no tuning parametersand runs no iterations, yet may outperform + has no tuning parameters and runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. .. versionchanged:: 1.1 From 81c749d68f9169a7917e6e6e9e4132a54cc6f095 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 17:49:07 -0400 Subject: [PATCH 58/90] Update plot_coin_segmentation.py typos fixed --- examples/cluster/plot_coin_segmentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 9ed638beb0f5c..a9b4073850215 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -21,9 +21,9 @@ .. versionchanged:: 1.1 Added new labeling method 'cluster_qr'. Changed the color scheme in plotting for better visibility. - Introduced a pause between plotting subsequent lables to visualize + Introduced a pause between plotting subsequent labels to visualize individual labels when run manually. - Indtroduced an ablility to plot less lables than actually computed. + Introduced an ablility to plot fewer labels than actually computed. """ print(__doc__) From fabcb7b9783e4eb33c2775b46f6e9932db92688a Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 17:52:19 -0400 Subject: [PATCH 59/90] Update _spectral.py redundant line removed --- sklearn/cluster/_spectral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index bc1e99549a756..c987e933ddbdb 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -47,7 +47,6 @@ def cluster_qr(vectors): k = vectors.shape[1] piv = qr(vectors.T, pivoting=True)[2] - piv = piv[0:k] ut, _, v = svd(vectors[piv[:k], :].T) vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) return vectors.argmax(axis=1).T From 3baa9daa78a8a7564df7f9620aded35d91a4ace8 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Thu, 7 Oct 2021 20:17:38 -0400 Subject: [PATCH 60/90] Update sklearn/cluster/_spectral.py light edit Co-authored-by: Victor Minden --- sklearn/cluster/_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index c987e933ddbdb..ce705ecfca964 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -466,7 +466,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): initialization [3]_. The newest cluster_qr method [5]_ directly extract clusters from eigenvectors in spectral clustering. In contrast to k-means and discretization, cluster_qr - has no tuning parametersand runs no iterations, yet may outperform + has no tuning parameters and runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. .. versionchanged:: 1.1 From 234e026152fe29355589154dccc76ad2e95c8271 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 8 Oct 2021 00:45:20 -0400 Subject: [PATCH 61/90] Update plot_coin_segmentation.py removed specific eigen_solver, see https://github.com/scikit-learn/scikit-learn/pull/21148#discussion_r723439157 and added a TODO comment to reflect https://github.com/scikit-learn/scikit-learn/pull/21148#discussion_r723425034 --- examples/cluster/plot_coin_segmentation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index a9b4073850215..2ef4e7b38afeb 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -72,7 +72,6 @@ # Apply spectral clustering (this step goes much faster if you have pyamg # installed and use eigen_solver = 'amg'). However, any valid solver can # be used (e.g., 'arpack', 'lobpcg', or 'amg'). -eigen_solver = "arpack" # The number of regions n_regions = 26 @@ -91,7 +90,6 @@ n_clusters=(n_regions + n_regions_plus), assign_labels=assign_labels, random_state=42, - eigen_solver=eigen_solver, ) t1 = time.time() @@ -109,3 +107,6 @@ plt.contour(labels == l, colors=colors) plt.pause(0.5) plt.show() + +# TODO: After #21194 is merged and lobpcg is faster than amg, as expected, +# we should probably use eigen_solver = 'lopbcg' explicitly in this example. From 2552663a1b11bdbb78e0556dcf074a2b60465716 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 8 Oct 2021 00:48:48 -0400 Subject: [PATCH 62/90] Update plot_coin_segmentation.py trailing whitespace removed --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 2ef4e7b38afeb..e19b654c05bc1 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -108,5 +108,5 @@ plt.pause(0.5) plt.show() -# TODO: After #21194 is merged and lobpcg is faster than amg, as expected, +# TODO: After #21194 is merged and lobpcg is faster than amg, as expected, # we should probably use eigen_solver = 'lopbcg' explicitly in this example. From d9f69eda1db10bd6ed645af0373a308016f3b547 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 8 Oct 2021 12:13:57 -0400 Subject: [PATCH 63/90] Update plot_coin_segmentation.py Introduced explicit eigen_tol=1e-7 Improved and extended the comments Commented out plt.pause(0.5) to speed up the auto test Manually tuned n_regions, n_regions_plus, and eigen_tol to speed up the test and still get good visuals. --- examples/cluster/plot_coin_segmentation.py | 27 ++++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index e19b654c05bc1..f19e0945bf5a8 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -69,25 +69,31 @@ eps = 1e-6 graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps -# Apply spectral clustering (this step goes much faster if you have pyamg -# installed and use eigen_solver = 'amg'). However, any valid solver can -# be used (e.g., 'arpack', 'lobpcg', or 'amg'). - -# The number of regions +# The number of segmented regions to display needs to be chosen manually. +# The current version of 'spectral_clustering' does not support determining +# the number of good quality clusters automatically. n_regions = 26 # %% # Compute and visualize the resulting regions -# Any eigen_solver: 'arpack', 'lobpcg', 'amg' can be used. AMG is usually best -# It often helps the spectral clustering to compute a few extra eigenvectors +# Computing a few extra eigenvectors may speed up the eigen_solver. +# The spectral clustering quality may also benetif from requesting +# extra regions for segmentation. n_regions_plus = 3 +# Apply spectral clustering using the default eigen_solver='arpack'. +# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'. +# Choosing eigen_solver='amg' requires an extra package called 'pyamg'. +# The quality of segmentation and the speed of calculations is mostly determined +# by the choice of the solver and the value of the tolerance 'eigen_tol'. +# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243. for assign_labels in ("kmeans", "discretize", "cluster_qr"): t0 = time.time() labels = spectral_clustering( graph, n_clusters=(n_regions + n_regions_plus), + eigen_tol=1e-7, assign_labels=assign_labels, random_state=42, ) @@ -105,8 +111,9 @@ for l in range(n_regions): colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))] plt.contour(labels == l, colors=colors) - plt.pause(0.5) + # To view individual segments as appear comment in plt.pause(0.5) plt.show() -# TODO: After #21194 is merged and lobpcg is faster than amg, as expected, -# we should probably use eigen_solver = 'lopbcg' explicitly in this example. +# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver +# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol +# explicitly in this example. From 7435c18916dc64dacdeb986b9522b60adda4761e Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 8 Oct 2021 12:22:31 -0400 Subject: [PATCH 64/90] Update plot_coin_segmentation.py black --- examples/cluster/plot_coin_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index f19e0945bf5a8..01ebab5df4aa2 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -82,7 +82,7 @@ # extra regions for segmentation. n_regions_plus = 3 -# Apply spectral clustering using the default eigen_solver='arpack'. +# Apply spectral clustering using the default eigen_solver='arpack'. # Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'. # Choosing eigen_solver='amg' requires an extra package called 'pyamg'. # The quality of segmentation and the speed of calculations is mostly determined From 05eaebd1237f51042b4c30cc69c0be3a016b89b6 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 12 Oct 2021 18:45:49 -0400 Subject: [PATCH 65/90] Apply suggestions from code review minor editing Co-authored-by: Olivier Grisel --- examples/cluster/plot_coin_segmentation.py | 6 ------ sklearn/cluster/_spectral.py | 11 ++--------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 01ebab5df4aa2..f31b4a422771f 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -18,12 +18,6 @@ space using a cluster_qr algorithm, * whereas 'discrete' will iteratively search for the closest partition space to the embedding space of spectral clustering. -.. versionchanged:: 1.1 - Added new labeling method 'cluster_qr'. - Changed the color scheme in plotting for better visibility. - Introduced a pause between plotting subsequent labels to visualize - individual labels when run manually. - Introduced an ablility to plot fewer labels than actually computed. """ print(__doc__) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index ce705ecfca964..0734c4bf36dd4 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -19,8 +19,7 @@ def cluster_qr(vectors): - """Search for a partition matrix (clustering) which is - closest to the eigenvector embedding. + """Find the discrete partition closest to the eigenvector embedding. This implementation was proposed in [1]_. @@ -49,7 +48,7 @@ def cluster_qr(vectors): piv = qr(vectors.T, pivoting=True)[2] ut, _, v = svd(vectors[piv[:k], :].T) vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) - return vectors.argmax(axis=1).T + return vectors.argmax(axis=1) def discretize( @@ -312,9 +311,6 @@ def spectral_clustering( Anil Damle, Victor Minden, Lexing Ying `_ - .. versionchanged:: 1.1 - Added new reference for the new labeling method 'cluster_qr'. - Notes ----- The graph should contain only one connected component, elsewhere @@ -564,9 +560,6 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Anil Damle, Victor Minden, Lexing Ying `_ - .. versionchanged:: 1.1 - Added new reference for the new labeling method 'cluster_qr'. - Examples -------- >>> from sklearn.cluster import SpectralClustering From 7c9b353a17c8e9dbb8001324a264e93c814bbcd7 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 12 Oct 2021 19:11:54 -0400 Subject: [PATCH 66/90] Update doc/modules/clustering.rst minor Co-authored-by: Olivier Grisel --- doc/modules/clustering.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 84c3b31f4b2d9..5ffceb2b0b671 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -511,8 +511,9 @@ In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. -The recently added option ``clusterQR`` is also 100% reproducible and tends -to create the visually best partitioning. +The recently added ``"cluster_qr"`` option is a deterministic alternative that +tends to create the visually best partitioning on the example application +below. ================================ ================================ ================================ ``assign_labels="kmeans"`` ``assign_labels="discretize"`` ``assign_labels="cluster_qr"`` From b635d649a4759049b8cde18cb99733c82ea68a8f Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 12 Oct 2021 20:54:15 -0400 Subject: [PATCH 67/90] Update test_spectral.py added test_cluster_qr_permutation_invariance as suggested --- sklearn/cluster/tests/test_spectral.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index a74d9d3fab5e2..320f39387d033 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -199,13 +199,25 @@ def histogram(x, y, **kwargs): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_cluster_qr(dtype): - # Test cluster_qr for fixed data different dtypes + # Test cluster_qr for fixed data different dtypes return the same lables random_state = np.random.RandomState(seed=8) data = random_state.randn(10, 5).astype(dtype) labels = cluster_qr(data) assert np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) +def test_cluster_qr_permutation_invariance(): + # Test that cluster_qr is invariant to sample permutation + random_state = np.random.RandomState(seed=8) + n_samples, n_components = 100, 5 + data = random_state.randn(n_samples, n_components) + perm = random_state.permutation(n_samples) + assert assert np.array_equal( + cluster_qr(data)[perm], + cluster_qr(data[perm]), + ) + + @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) def test_discretize(n_samples): # Test the discretize using a noise assignment matrix From 370227438246744ae4f2940c99c15b6b8f1e187e Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 12 Oct 2021 20:57:47 -0400 Subject: [PATCH 68/90] Update test_spectral.py typo fixed --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 320f39387d033..3925ac7c25684 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -212,7 +212,7 @@ def test_cluster_qr_permutation_invariance(): n_samples, n_components = 100, 5 data = random_state.randn(n_samples, n_components) perm = random_state.permutation(n_samples) - assert assert np.array_equal( + assert np.array_equal( cluster_qr(data)[perm], cluster_qr(data[perm]), ) From af56485d2e02253f28a9a228cefc3d7a3097bb7d Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 12 Oct 2021 21:14:42 -0400 Subject: [PATCH 69/90] Update plot_coin_segmentation.py edited the DocString to address a suggestion --- examples/cluster/plot_coin_segmentation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index f31b4a422771f..0c99a30c96e77 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -12,12 +12,12 @@ There are three options to assign labels: -* with 'kmeans' spectral clustering will cluster samples in the embedding space +* 'kmeans' spectral clustering clusters samples in the embedding space using a kmeans algorithm -* with 'cluster_qr' spectral clustering will cluster samples in the embedding - space using a cluster_qr algorithm, -* whereas 'discrete' will iteratively search for the closest partition +* 'discrete' iteratively searchs for the closest partition space to the embedding space of spectral clustering. +* 'cluster_qr' assigns lables using the QR factorization with pivoting + that directly determines the partiion in the embedding space. """ print(__doc__) From 7e361fb8135995d22ed2d944d846449d3260f89f Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 13 Oct 2021 09:22:03 -0400 Subject: [PATCH 70/90] Update _spectral.py move all imports to the top --- sklearn/cluster/_spectral.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 0734c4bf36dd4..702dd635c9092 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -9,6 +9,13 @@ import numpy as np +# Required in cluster_qr +from scipy.linalg import qr, svd + +# Required in discretize +from scipy.sparse import csc_matrix +from scipy.linalg import LinAlgError + from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array from ..utils.deprecation import deprecated @@ -42,8 +49,6 @@ def cluster_qr(vectors): .. versionadded:: 1.1 """ - from scipy.linalg import qr, svd - k = vectors.shape[1] piv = qr(vectors.T, pivoting=True)[2] ut, _, v = svd(vectors[piv[:k], :].T) @@ -106,9 +111,6 @@ def discretize( """ - from scipy.sparse import csc_matrix - from scipy.linalg import LinAlgError - random_state = check_random_state(random_state) vectors = as_float_array(vectors, copy=copy) From 5f62fd46af88ded2a67447e28422984d30d9fda1 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 13 Oct 2021 10:05:39 -0400 Subject: [PATCH 71/90] Update _spectral.py Unrelated to this PR, but while at it, improved comments on eigensolvers --- sklearn/cluster/_spectral.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 702dd635c9092..849d6d1985478 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -235,10 +235,11 @@ def spectral_clustering( Number of eigenvectors to use for the spectral embedding eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. If None, then ``'arpack'`` is - used. See [4]_ for more details regarding `'lobpcg'`. + The eigenvalue decomposition method. If None then ``'arpack'`` is used. + See [4]_ for more details regarding ``'lobpcg'``. + Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional + Algebraic MultiGrid preconditioning and requires pyamg to be installed. + It can be faster on very large sparse problems, but may be instabile. random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization From 995e2f4b03d6bb79d809b0509e34443a166b587d Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 13 Oct 2021 10:23:10 -0400 Subject: [PATCH 72/90] Apply suggestions from code review editing Co-authored-by: Olivier Grisel --- sklearn/cluster/_spectral.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 849d6d1985478..556f4555c7061 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -30,6 +30,8 @@ def cluster_qr(vectors): This implementation was proposed in [1]_. +.. versionadded:: 1.1 + Parameters ---------- vectors : array-like, shape: (n_samples, n_clusters) @@ -46,7 +48,6 @@ def cluster_qr(vectors): Anil Damle, Victor Minden, Lexing Ying `_ - .. versionadded:: 1.1 """ k = vectors.shape[1] @@ -239,7 +240,7 @@ def spectral_clustering( See [4]_ for more details regarding ``'lobpcg'``. Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional Algebraic MultiGrid preconditioning and requires pyamg to be installed. - It can be faster on very large sparse problems, but may be instabile. + It can be faster on very large sparse problems, but may be unstable. random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization From df3c1aa71e3a7e394546922b6fcf5d463760ad7a Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Wed, 13 Oct 2021 10:51:34 -0400 Subject: [PATCH 73/90] Update _spectral.py black --- sklearn/cluster/_spectral.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 556f4555c7061..9a62ed9c639ef 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -28,25 +28,25 @@ def cluster_qr(vectors): """Find the discrete partition closest to the eigenvector embedding. - This implementation was proposed in [1]_. + This implementation was proposed in [1]_. -.. versionadded:: 1.1 + .. versionadded:: 1.1 - Parameters - ---------- - vectors : array-like, shape: (n_samples, n_clusters) - The embedding space of the samples. + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. - Returns - ------- - labels : array of integers, shape: n_samples - The labels of the clusters. + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. - References - ---------- - .. [1] `Simple, direct, and efficient multi-way spectral clustering, 2019 - Anil Damle, Victor Minden, Lexing Ying - `_ + References + ---------- + .. [1] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + `_ """ From ceb88cc6ad9dead73991415b26af89495168969b Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 15 Oct 2021 10:09:06 -0400 Subject: [PATCH 74/90] Update v1.1.rst typo fixed --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index a4b5fbf553a55..1266bb8d9144b 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -49,7 +49,7 @@ Changelog ...................... - |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral` - now includes the new 'cluster_qr' method from :func:`cluster.cluster_qr` + now include the new 'cluster_qr' method from :func:`cluster.cluster_qr` that clusters samples in the embedding space as an alternative to the existing 'kmeans' and 'discrete' methods. See `spectral_clustering` for more details. :pr:`21148` by :user:`Andrew Knyazev ` From 1168562b28db18e70aa9c7c2e981bb7f759947b0 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:26:43 -0400 Subject: [PATCH 75/90] Update examples/cluster/plot_coin_segmentation.py typos fixed Co-authored-by: Julien Jerphanion --- examples/cluster/plot_coin_segmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index ad1dfe554f928..e599c986646e0 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -14,10 +14,10 @@ * 'kmeans' spectral clustering clusters samples in the embedding space using a kmeans algorithm -* 'discrete' iteratively searchs for the closest partition +* 'discrete' iteratively searches for the closest partition space to the embedding space of spectral clustering. -* 'cluster_qr' assigns lables using the QR factorization with pivoting - that directly determines the partiion in the embedding space. +* 'cluster_qr' assigns labels using the QR factorization with pivoting + that directly determines the partition in the embedding space. """ # Author: Gael Varoquaux , Brian Cheung From 299fb0da9da5d6022529df23ed1ec1fe2214dab7 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:27:22 -0400 Subject: [PATCH 76/90] Update doc/whats_new/v1.1.rst formatting Co-authored-by: Julien Jerphanion --- doc/whats_new/v1.1.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index ce6b15b545a94..519d73e58a7a0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -49,9 +49,10 @@ Changelog ...................... - |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral` - now include the new 'cluster_qr' method from :func:`cluster.cluster_qr` + now include the new `'cluster_qr'` method from :func:`cluster.cluster_qr` that clusters samples in the embedding space as an alternative to the existing - 'kmeans' and 'discrete' methods. See `spectral_clustering` for more details. + `'kmeans'` and `'discrete'` methods. + See :func:`cluster.spectral_clustering` for more details. :pr:`21148` by :user:`Andrew Knyazev ` :mod:`sklearn.cross_decomposition` From c99a58e9981090e98ebf637ae28b11c62b37f5f5 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:28:28 -0400 Subject: [PATCH 77/90] Update sklearn/cluster/_spectral.py minor Co-authored-by: Julien Jerphanion --- sklearn/cluster/_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 9a62ed9c639ef..58c2bb32c466f 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -272,7 +272,7 @@ def spectral_clustering( embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. - The newest cluster_qr method [5]_ directly extracts clusters from eigenvectors + The cluster_qr method [5]_ directly extracts clusters from eigenvectors in spectral clustering. In contrast to k-means and discretization, cluster_qr has no tuning parameters and runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. From 006bc964a2ec0b0734ff3045a72551acf07dc60c Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:28:56 -0400 Subject: [PATCH 78/90] Update sklearn/cluster/_spectral.py minor Co-authored-by: Julien Jerphanion --- sklearn/cluster/_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 58c2bb32c466f..4cf194d51b6a7 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -464,7 +464,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): popular choice, but it can be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. - The newest cluster_qr method [5]_ directly extract clusters from eigenvectors + The cluster_qr method [5]_ directly extract clusters from eigenvectors in spectral clustering. In contrast to k-means and discretization, cluster_qr has no tuning parameters and runs no iterations, yet may outperform k-means and discretization in terms of both quality and speed. From f367dbe4b254105765dcb55ab36afac1afdf1225 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:29:50 -0400 Subject: [PATCH 79/90] Update sklearn/cluster/_spectral.py remove comments Co-authored-by: Julien Jerphanion --- sklearn/cluster/_spectral.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 4cf194d51b6a7..93432293f3dc6 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -9,12 +9,8 @@ import numpy as np -# Required in cluster_qr -from scipy.linalg import qr, svd - -# Required in discretize +from scipy.linalg import LinAlgError, qr, svd from scipy.sparse import csc_matrix -from scipy.linalg import LinAlgError from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array From 1904c87392a64b9ed3ba975c55e14d434788be48 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:38:56 -0400 Subject: [PATCH 80/90] Update plot_coin_segmentation.py an author added as suggested --- examples/cluster/plot_coin_segmentation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index e599c986646e0..cf916df3167c2 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -20,7 +20,9 @@ that directly determines the partition in the embedding space. """ -# Author: Gael Varoquaux , Brian Cheung +# Author: Gael Varoquaux +# Brian Cheung +# Andrew Knyazev # License: BSD 3 clause import time From 17c4b108e33cc902720126a7806f50d4fa43d1b5 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 21:51:39 -0400 Subject: [PATCH 81/90] Update _spectral.py doi + author --- sklearn/cluster/_spectral.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 93432293f3dc6..88386595cd2e7 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- """Algorithms for spectral clustering""" -# Author: Gael Varoquaux gael.varoquaux@normalesup.org +# Author: Gael Varoquaux # Brian Cheung # Wei LI +# Andrew Knyazev # License: BSD 3 clause import warnings @@ -42,7 +43,7 @@ def cluster_qr(vectors): ---------- .. [1] `Simple, direct, and efficient multi-way spectral clustering, 2019 Anil Damle, Victor Minden, Lexing Ying - `_ + <:doi:`10.1093/imaiai/iay008`>`_ """ @@ -305,11 +306,11 @@ def spectral_clustering( Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001. A. V. Knyazev SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. - `_ + <:doi:`10.1137/S1064827500366124`>`_ .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 Anil Damle, Victor Minden, Lexing Ying - `_ + <:doi:`10.1093/imaiai/iay008`>`_ Notes ----- From 30cf92ffbe08a7b0bb027742066709466e77d477 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 22:01:21 -0400 Subject: [PATCH 82/90] Update clustering.rst added a reference to cluster_qr --- doc/modules/clustering.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index cb21c21572c7b..89162dbb98254 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -522,6 +522,12 @@ below. |coin_kmeans| |coin_discretize| |coin_cluster_qr| ================================ ================================ ================================ +.. topic:: References: + + * `"Simple, direct, and efficient multi-way spectral clustering" + <:doi:`10.1093/imaiai/iay008`>`_ + Anil Damle, Victor Minden, Lexing Ying, 2019 + Spectral Clustering Graphs -------------------------- From 01367117146fc243aafef120d3490c2ac94034a3 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 22:35:16 -0400 Subject: [PATCH 83/90] Update _spectral.py lobpcg with amg references added as requested --- sklearn/cluster/_spectral.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 88386595cd2e7..cbe9f1d7f66dd 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -237,7 +237,7 @@ def spectral_clustering( See [4]_ for more details regarding ``'lobpcg'``. Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional Algebraic MultiGrid preconditioning and requires pyamg to be installed. - It can be faster on very large sparse problems, but may be unstable. + It can be faster on very large sparse problems [6]_ and [7]_. random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization @@ -303,7 +303,7 @@ def spectral_clustering( `_ .. [4] `Toward the Optimal Preconditioned Eigensolver: - Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001. + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 A. V. Knyazev SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. <:doi:`10.1137/S1064827500366124`>`_ @@ -312,6 +312,16 @@ def spectral_clustering( Anil Damle, Victor Minden, Lexing Ying <:doi:`10.1093/imaiai/iay008`>`_ + .. [6] `Multiscale Spectral Image Segmentation Multiscale preconditioning + for computing eigenvalues of graph Laplacians in image segmentation, 2006 + Andrew Knyazev + <:doi:`10.13140/RG.2.2.35280.02565`>`_ + + .. [7] `Preconditioned spectral clustering for stochastic block partition + streaming graph challenge (Preliminary version at arXiv.) + David Zhuzhunashvili, Andrew Knyazev + <:doi:`10.1109/HPEC.2017.8091045`>`_ + Notes ----- The graph should contain only one connected component, elsewhere From 13b266bdda9080cda88d3c182b1ac1ba7d8130cb Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Mon, 25 Oct 2021 22:58:12 -0400 Subject: [PATCH 84/90] Update clustering.rst ' error fix --- doc/modules/clustering.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 89162dbb98254..442cc35a16e77 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -524,8 +524,7 @@ below. .. topic:: References: - * `"Simple, direct, and efficient multi-way spectral clustering" - <:doi:`10.1093/imaiai/iay008`>`_ + * :doi:`"Simple, direct, and efficient multi-way spectral clustering"<10.1093/imaiai/iay008>` Anil Damle, Victor Minden, Lexing Ying, 2019 Spectral Clustering Graphs From 64f5dc15154f991bdaf70505b68fd36531a5b174 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Tue, 26 Oct 2021 00:28:50 -0400 Subject: [PATCH 85/90] Update clustering.rst added the ``"discretize"`` reference --- doc/modules/clustering.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 442cc35a16e77..ac4807e052f66 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -523,6 +523,10 @@ below. ================================ ================================ ================================ .. topic:: References: + + * `"Multiclass spectral clustering" + `_ + Stella X. Yu, Jianbo Shi, 2003 * :doi:`"Simple, direct, and efficient multi-way spectral clustering"<10.1093/imaiai/iay008>` Anil Damle, Victor Minden, Lexing Ying, 2019 From 545f8d965f54db5ab776bf311e9961baf21cd6e6 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 29 Oct 2021 13:01:40 -0400 Subject: [PATCH 86/90] Apply suggestions from code review misc editing Co-authored-by: Julien Jerphanion --- sklearn/cluster/_spectral.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index cbe9f1d7f66dd..f96a11c177c8a 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -37,7 +37,7 @@ def cluster_qr(vectors): Returns ------- labels : array of integers, shape: n_samples - The labels of the clusters. + The cluster labels of vectors. References ---------- @@ -48,7 +48,7 @@ def cluster_qr(vectors): """ k = vectors.shape[1] - piv = qr(vectors.T, pivoting=True)[2] + _, _, piv = qr(vectors.T, pivoting=True) ut, _, v = svd(vectors[piv[:k], :].T) vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) return vectors.argmax(axis=1) @@ -271,7 +271,7 @@ def spectral_clustering( approach which is less sensitive to random initialization [3]_. The cluster_qr method [5]_ directly extracts clusters from eigenvectors in spectral clustering. In contrast to k-means and discretization, cluster_qr - has no tuning parameters and runs no iterations, yet may outperform + has no tuning parameters and is not an iterative method, yet may outperform k-means and discretization in terms of both quality and speed. .. versionchanged:: 1.1 @@ -569,7 +569,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 Anil Damle, Victor Minden, Lexing Ying - `_ + <:doi:`10.1093/imaiai/iay008`>`_ Examples -------- From 69717220a1dc2e0fbc6c24b665dd24bbd4dd4d73 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 29 Oct 2021 14:15:05 -0400 Subject: [PATCH 87/90] Apply suggestions from code review previous missed suggestions finally committed Co-authored-by: Julien Jerphanion --- sklearn/cluster/tests/test_spectral.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 3925ac7c25684..1ef71ddfc3b48 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -197,17 +197,18 @@ def histogram(x, y, **kwargs): sp.fit(X) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_cluster_qr(dtype): - # Test cluster_qr for fixed data different dtypes return the same lables +def test_cluster_qr(): + # cluster_qr for fixed data but different dtypes must return the same labels. random_state = np.random.RandomState(seed=8) - data = random_state.randn(10, 5).astype(dtype) - labels = cluster_qr(data) - assert np.array_equal(labels, np.array([2, 1, 3, 3, 2, 4, 1, 3, 4, 0])) + X_64 = random_state.randn(10, 5).astype(np.float64) + X_32 = random_state.randn(10, 5).astype(np.float32) + labels_64 = cluster_qr(X_64) + labels_32 = cluster_qr(X_32) + assert np.array_equal(labels_64, labels_32) def test_cluster_qr_permutation_invariance(): - # Test that cluster_qr is invariant to sample permutation + # cluster_qr must be invariant to sample permutation. random_state = np.random.RandomState(seed=8) n_samples, n_components = 100, 5 data = random_state.randn(n_samples, n_components) From f59971123b5bf3f9905c7076219cc00a9ef0954e Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 29 Oct 2021 14:58:17 -0400 Subject: [PATCH 88/90] Update test_spectral.py as proposed in https://github.com/scikit-learn/scikit-learn/pull/21148#discussion_r726948340 --- sklearn/cluster/tests/test_spectral.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 1ef71ddfc3b48..c7d4d02a1daac 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -198,13 +198,22 @@ def histogram(x, y, **kwargs): def test_cluster_qr(): - # cluster_qr for fixed data but different dtypes must return the same labels. + # cluster_qr by itself should not be used for clusteing generic data + # other than the rows of the eigenvectors within spectral clustering, + # but cluster_qr must still preserve the labels for different dtypes + # of the generic fixed input even if the labels may be meaningless. random_state = np.random.RandomState(seed=8) - X_64 = random_state.randn(10, 5).astype(np.float64) - X_32 = random_state.randn(10, 5).astype(np.float32) - labels_64 = cluster_qr(X_64) - labels_32 = cluster_qr(X_32) - assert np.array_equal(labels_64, labels_32) + n_samples, n_components = 10, 5 + data = random_state.randn(n_samples, n_components) + labels_float64 = cluster_qr(data.astype(np.float64)) + # Each sample is assigned a cluster identifier + assert labels_float64.shape == (n_samples,) + assert labels_float64.dtype == np.int64 + # All components should be covered by the assignment + assert np.array_equal(np.unique(labels_float64), np.arange(n_components)) + # Single precision data should yield the same cluster assignments + labels_float32 = cluster_qr(data.astype(np.float32)) + assert np.array_equal(labels_float64, labels_float32) def test_cluster_qr_permutation_invariance(): From 4be590cc75823194b521979e9fabe1e07d4c8739 Mon Sep 17 00:00:00 2001 From: Andrew Knyazev Date: Fri, 29 Oct 2021 15:43:53 -0400 Subject: [PATCH 89/90] Update test_spectral.py assert labels_float64.dtype == np.int64 naturally failed on 32-bit OS so was removed --- sklearn/cluster/tests/test_spectral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index c7d4d02a1daac..fecb5dfcd7014 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -208,7 +208,6 @@ def test_cluster_qr(): labels_float64 = cluster_qr(data.astype(np.float64)) # Each sample is assigned a cluster identifier assert labels_float64.shape == (n_samples,) - assert labels_float64.dtype == np.int64 # All components should be covered by the assignment assert np.array_equal(np.unique(labels_float64), np.arange(n_components)) # Single precision data should yield the same cluster assignments From ff3547680f537d6665e43b1e6f1962b7c5f21f58 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 2 Nov 2021 11:31:48 +0100 Subject: [PATCH 90/90] Typo in comment --- sklearn/cluster/tests/test_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index fecb5dfcd7014..07dd4b64514ac 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -198,7 +198,7 @@ def histogram(x, y, **kwargs): def test_cluster_qr(): - # cluster_qr by itself should not be used for clusteing generic data + # cluster_qr by itself should not be used for clustering generic data # other than the rows of the eigenvectors within spectral clustering, # but cluster_qr must still preserve the labels for different dtypes # of the generic fixed input even if the labels may be meaningless.