From 0d074beb45d6f2898d67125676ac74a04b46df22 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 16 Nov 2022 17:01:45 +0100 Subject: [PATCH 01/25] First step to improve notebook style --- examples/cluster/plot_kmeans_assumptions.py | 78 ++++++++++----------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index b345197464de8..e1b4fecc6654c 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -14,56 +14,54 @@ # Author: Phil Roth # License: BSD 3 clause -import numpy as np -import matplotlib.pyplot as plt +# %% +# Data generation +# --------------- -from sklearn.cluster import KMeans +import numpy as np from sklearn.datasets import make_blobs -plt.figure(figsize=(12, 12)) - n_samples = 1500 random_state = 170 +transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]] + X, y = make_blobs(n_samples=n_samples, random_state=random_state) +X_aniso = np.dot(X, transformation) # Anisotropic blobs +X_varied, y_varied = make_blobs( + n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state +) # Unequal variance +X_filtered = np.vstack( + (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]) +) # Unevenly sized blobs -# Incorrect number of clusters -y_pred = KMeans(n_clusters=2, n_init="auto", random_state=random_state).fit_predict(X) +# %% +# Plot results +# ------------ -plt.subplot(221) -plt.scatter(X[:, 0], X[:, 1], c=y_pred) -plt.title("Incorrect Number of Blobs") +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans -# Anisotropicly distributed data -transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]] -X_aniso = np.dot(X, transformation) -y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict( - X_aniso -) +common_params = { + "n_init": "auto", + "random_state": random_state, +} -plt.subplot(222) -plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) -plt.title("Anisotropicly Distributed Blobs") +fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) -# Different variance -X_varied, y_varied = make_blobs( - n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state -) -y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict( - X_varied -) - -plt.subplot(223) -plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) -plt.title("Unequal Variance") - -# Unevenly sized blobs -X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])) -y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( - X_filtered -) - -plt.subplot(224) -plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) -plt.title("Unevenly Sized Blobs") +y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) +axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) +axs[0, 0].set_title("Incorrect Number of Blobs") + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) +axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) +axs[0, 1].set_title("Anisotropicly Distributed Blobs") + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied) +axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) +axs[1, 0].set_title("Unequal Variance") + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered) +axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) +axs[1, 1].set_title("Unevenly Sized Blobs") plt.show() From 
0366ba470c7f07f8cd1f2eae1ad82ac09ad065ed Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 17 Nov 2022 16:42:06 +0100 Subject: [PATCH 02/25] Improve narrative --- examples/cluster/plot_kmeans_assumptions.py | 34 ++++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index e1b4fecc6654c..172d61f9ebac3 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -4,10 +4,23 @@ ==================================== This example is meant to illustrate situations where k-means will produce -unintuitive and possibly unexpected clusters. In the first three plots, the -input data does not conform to some implicit assumption that k-means makes and -undesirable clusters are produced as a result. In the last plot, k-means -returns intuitive clusters despite unevenly sized blobs. +unintuitive and possibly undesirable clusters. + +- Incorrect number of blobs: in a real setting there is no uniquely defined + **true** number of clusters. An appropriate number of clusters has to be + decided from data-based criteria and knowledge of aim. +- Anisotropically distributed blobs: k-means consists of minimizing sample's + euclidean distances to the centroid of the cluster they are assigned + to. As a consequence, k-means is more appropriated for clusters that are + normally distributed with a spherical covariance matrix. +- Unequal variance: k-means is equivalent to taking the maximum likelihood + estimator for a "mixture" of k gaussian distributions with the same variances + but with possibly different means. +- Unevenly sized blobs: there is no theoretical result about k-means that states + that it requires similar cluster sizes to perform well, yet minimizing + euclidean distances does mean that the more sparse and high-dimensional the + problem is, the higher is the need to run the algorithm with different + centroid seeds to ensure a minimal inertia. """ @@ -54,7 +67,7 @@ y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) -axs[0, 1].set_title("Anisotropicly Distributed Blobs") +axs[0, 1].set_title("Anisotropically Distributed Blobs") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied) axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) @@ -65,3 +78,14 @@ axs[1, 1].set_title("Unevenly Sized Blobs") plt.show() + +# %% +# For an example on how to find a correct number of blobs, see the example +# :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. +# +# For an example on how other clustering methods deal with anisotropic or +# unequal variance blobs, see the example +# :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`. +# +# For more details on how to deal with unevenly sized blobs, see +# :ref:`kmeans_sparse_high_dim`. 
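The bullets introduced in this patch lean on k-means' objective: the sum of
squared euclidean distances from each sample to the centroid of its assigned
cluster, exposed by scikit-learn as `KMeans.inertia_`. Here is a minimal sketch
of that equivalence; it reuses the example's `n_samples=1500` and
`random_state=170` settings but is otherwise not part of the patch series:

    import numpy as np

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1500, random_state=170)
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=170).fit(X)

    # Recompute the inertia by hand: the sum of squared euclidean distances
    # from each sample to the centroid of the cluster it is assigned to.
    manual_inertia = sum(
        ((X[kmeans.labels_ == k] - center) ** 2).sum()
        for k, center in enumerate(kmeans.cluster_centers_)
    )
    assert np.isclose(manual_inertia, kmeans.inertia_)

Because this objective penalizes squared distances equally in every direction,
minimizing it matches the equal-variance spherical gaussian reading given in
the "Unequal variance" bullet, which is why elongated or unequal-variance blobs
mislead the algorithm.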
From 3e999368671fab23fa7bc18206b3defc7ce658c6 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 17 Nov 2022 16:42:51 +0100 Subject: [PATCH 03/25] Add possible solution --- examples/cluster/plot_kmeans_assumptions.py | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 172d61f9ebac3..7039381352a44 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -89,3 +89,31 @@ # # For more details on how to deal with unevenly sized blobs, see # :ref:`kmeans_sparse_high_dim`. + +# %% +# Possible solution +# ----------------- + +from sklearn.mixture import GaussianMixture + +fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) +axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) +axs[0, 0].set_title("Incorrect Number of Blobs") + +y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_aniso) +axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) +axs[0, 1].set_title("Anisotropically Distributed Blobs") + +y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_varied) +axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) +axs[1, 0].set_title("Unequal Variance") + +y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( + X_filtered +) +axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) +axs[1, 1].set_title("Unevenly Sized Blobs") + +plt.show() From 2bbf0d0f22840e642ecfc4ca43ab99db0a603b04 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 18 Nov 2022 11:20:03 +0100 Subject: [PATCH 04/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Tim Head --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 7039381352a44..62a774b86d9e7 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -11,7 +11,7 @@ decided from data-based criteria and knowledge of aim. - Anisotropically distributed blobs: k-means consists of minimizing sample's euclidean distances to the centroid of the cluster they are assigned - to. As a consequence, k-means is more appropriated for clusters that are + to. As a consequence, k-means is more appropriate for clusters that are normally distributed with a spherical covariance matrix. - Unequal variance: k-means is equivalent to taking the maximum likelihood estimator for a "mixture" of k gaussian distributions with the same variances From 4f658b7d41ec65c41c4a900f79bc2b531ea02c1a Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 11:57:35 +0100 Subject: [PATCH 05/25] Wording tweak --- examples/cluster/plot_kmeans_assumptions.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 62a774b86d9e7..f4126e46e65fe 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -12,7 +12,7 @@ - Anisotropically distributed blobs: k-means consists of minimizing sample's euclidean distances to the centroid of the cluster they are assigned to. 
As a consequence, k-means is more appropriate for clusters that are - normally distributed with a spherical covariance matrix. + isotropic and normally distributed (i.e. spherical gaussians). - Unequal variance: k-means is equivalent to taking the maximum likelihood estimator for a "mixture" of k gaussian distributions with the same variances but with possibly different means. @@ -20,7 +20,7 @@ that it requires similar cluster sizes to perform well, yet minimizing euclidean distances does mean that the more sparse and high-dimensional the problem is, the higher is the need to run the algorithm with different - centroid seeds to ensure a minimal inertia. + centroid seeds to ensure a global minimal inertia. """ @@ -30,6 +30,10 @@ # %% # Data generation # --------------- +# +# The function :func:`~sklearn.datasets.make_blobs` generates isotropic gaussian +# blobs. To obtain anisotropic (elliptical) gaussian blobs one has to define a +# linear `transformation`. import numpy as np from sklearn.datasets import make_blobs From 33f3f051482067acde334b4479d14ec7cf8c9b9f Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:09:33 +0100 Subject: [PATCH 06/25] Improve narrative --- examples/cluster/plot_kmeans_assumptions.py | 22 ++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index f4126e46e65fe..3e6802c833349 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -84,19 +84,27 @@ plt.show() # %% +# Possible solution +# ----------------- +# # For an example on how to find a correct number of blobs, see the example # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. +# In this case it suffices to set `n_clusters=3`. +# +# To deal with unevenly sized blobs one can increase the number of random +# initializations. In this case we set `n_init=10` to avoid finding a +# sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`. +# +# As anisotropic and unequal variances are real limitations of the k-means +# algorithmn, here we propose instead the use of +# :class:`~sklearn.mixture.GaussianMixture`, which also assume gaussian clusters +# but has no constraints on their variances. Notice that one still has to find +# the correct number of blobs (see +# :ref:`sphx_glr_auto_examples_model_selection_plot_gmm_selection.py`). # # For an example on how other clustering methods deal with anisotropic or # unequal variance blobs, see the example # :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`. -# -# For more details on how to deal with unevenly sized blobs, see -# :ref:`kmeans_sparse_high_dim`. 
- -# %% -# Possible solution -# ----------------- from sklearn.mixture import GaussianMixture From cd8da2922090bed0918dfa2ecc50740fbb8405b9 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:09:55 +0100 Subject: [PATCH 07/25] Simplify code --- examples/cluster/plot_kmeans_assumptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 3e6802c833349..ebce1dc786653 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -114,11 +114,11 @@ axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) axs[0, 0].set_title("Incorrect Number of Blobs") -y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_aniso) +y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) axs[0, 1].set_title("Anisotropically Distributed Blobs") -y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_varied) +y_pred = GaussianMixture(n_components=3).fit_predict(X_varied) axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) axs[1, 0].set_title("Unequal Variance") From 03baf3086b4907222ca58fa643227ec26b60a785 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:23:00 +0100 Subject: [PATCH 08/25] Add co-author --- examples/cluster/plot_kmeans_assumptions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index ebce1dc786653..26c4a5cc1ceed 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -25,6 +25,7 @@ """ # Author: Phil Roth +# Arturo Amor # License: BSD 3 clause # %% From 2a6952b692e612ce926507affcb00a844b42882a Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:25:36 +0100 Subject: [PATCH 09/25] Tweak --- examples/cluster/plot_kmeans_assumptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 26c4a5cc1ceed..9f06d2bb31f70 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -53,8 +53,8 @@ ) # Unevenly sized blobs # %% -# Plot results -# ------------ +# Fit models and plot results +# --------------------------- import matplotlib.pyplot as plt from sklearn.cluster import KMeans From 00a889c262ccc45f7879335baf1e0776dc293849 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:37:34 +0100 Subject: [PATCH 10/25] Iter --- examples/cluster/plot_kmeans_assumptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 9f06d2bb31f70..433c1447f5f46 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -88,7 +88,7 @@ # Possible solution # ----------------- # -# For an example on how to find a correct number of blobs, see the example +# For an example on how to find a correct number of blobs, see # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. # In this case it suffices to set `n_clusters=3`. # @@ -101,7 +101,7 @@ # :class:`~sklearn.mixture.GaussianMixture`, which also assume gaussian clusters # but has no constraints on their variances. 
Notice that one still has to find # the correct number of blobs (see -# :ref:`sphx_glr_auto_examples_model_selection_plot_gmm_selection.py`). +# :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`). # # For an example on how other clustering methods deal with anisotropic or # unequal variance blobs, see the example From 2c61da27ee0c91cda2a537b15a7e09f15119572b Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 14:36:39 +0100 Subject: [PATCH 11/25] Add plot of generated data --- doc/modules/clustering.rst | 4 ++-- examples/cluster/plot_kmeans_assumptions.py | 26 ++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index efdf01f55295b..7e84848e1605e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -170,7 +170,7 @@ It suffers from various drawbacks: k-means clustering can alleviate this problem and speed up the computations. -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_001.png +.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_002.png :target: ../auto_examples/cluster/plot_kmeans_assumptions.html :align: center :scale: 50 @@ -2110,5 +2110,5 @@ diagonal entries:: .. topic:: References - * :doi:`"Comparing Partitions" <10.1007/BF01908075>` + * :doi:`"Comparing Partitions" <10.1007/BF01908075>` L. Hubert and P. Arabie, Journal of Classification 1985 diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 433c1447f5f46..06a3e552b9787 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -51,12 +51,34 @@ X_filtered = np.vstack( (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]) ) # Unevenly sized blobs +y_filtered = [[0] * 500 + [1] * 100 + [2] * 10] + +# %% +# We can visualize the resulting data: + +import matplotlib.pyplot as plt + +fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) + +axs[0, 0].scatter(X[:, 0], X[:, 1], c=y) +axs[0, 0].set_title("Incorrect Number of Blobs") + +axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y) +axs[0, 1].set_title("Anisotropically Distributed Blobs") + +axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied) +axs[1, 0].set_title("Unequal Variance") + +axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered) +axs[1, 1].set_title("Unevenly Sized Blobs") + +plt.suptitle("Ground truth clusters").set_y(0.95) +plt.show() # %% # Fit models and plot results # --------------------------- -import matplotlib.pyplot as plt from sklearn.cluster import KMeans common_params = { @@ -82,6 +104,7 @@ axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) axs[1, 1].set_title("Unevenly Sized Blobs") +plt.suptitle("Unexpected KMeans clusters").set_y(0.95) plt.show() # %% @@ -129,4 +152,5 @@ axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) axs[1, 1].set_title("Unevenly Sized Blobs") +plt.suptitle("KMeans and gaussian mixture clusters").set_y(0.95) plt.show() From d2f850b5d9c0c7fd4af3418339db0d66cec3e3a1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 15:08:10 +0100 Subject: [PATCH 12/25] Delay explanation after plotting as suggested by Guillaume --- examples/cluster/plot_kmeans_assumptions.py | 43 +++++++++++---------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 06a3e552b9787..c43336d773749 100644 --- 
a/examples/cluster/plot_kmeans_assumptions.py
+++ b/examples/cluster/plot_kmeans_assumptions.py
@@ -3,25 +3,9 @@
 Demonstration of k-means assumptions
 ====================================
 
-This example is meant to illustrate situations where k-means will produce
+This example is meant to illustrate situations where k-means produces
 unintuitive and possibly undesirable clusters.
 
-- Incorrect number of blobs: in a real setting there is no uniquely defined
-  **true** number of clusters. An appropriate number of clusters has to be
-  decided from data-based criteria and knowledge of aim.
-- Anisotropically distributed blobs: k-means consists of minimizing sample's
-  euclidean distances to the centroid of the cluster they are assigned
-  to. As a consequence, k-means is more appropriate for clusters that are
-  isotropic and normally distributed (i.e. spherical gaussians).
-- Unequal variance: k-means is equivalent to taking the maximum likelihood
-  estimator for a "mixture" of k gaussian distributions with the same variances
-  but with possibly different means.
-- Unevenly sized blobs: there is no theoretical result about k-means that states
-  that it requires similar cluster sizes to perform well, yet minimizing
-  euclidean distances does mean that the more sparse and high-dimensional the
-  problem is, the higher is the need to run the algorithm with different
-  centroid seeds to ensure a global minimal inertia.
-
 """
 
 # Author: Phil Roth
@@ -32,9 +16,9 @@
 # Data generation
 # ---------------
 #
-# The function :func:`~sklearn.datasets.make_blobs` generates isotropic gaussian
-# blobs. To obtain anisotropic (elliptical) gaussian blobs one has to define a
-# linear `transformation`.
+# The function :func:`~sklearn.datasets.make_blobs` generates isotropic
+# (spherical) gaussian blobs. To obtain anisotropic (elliptical) gaussian blobs
+# one has to define a linear `transformation`.
 
 import numpy as np
 from sklearn.datasets import make_blobs
@@ -78,6 +62,25 @@
 # %%
 # Fit models and plot results
 # ---------------------------
+#
+# The previously generated data is now used for showing how
+# :class:`~sklearn.cluster.KMeans` behaves in the following scenarios:
+#
+# - Incorrect number of blobs: in a real setting there is no uniquely defined
+#   **true** number of clusters. An appropriate number of clusters has to be
+#   decided from data-based criteria and knowledge of aim.
+# - Anisotropically distributed blobs: k-means consists of minimizing sample's
+#   euclidean distances to the centroid of the cluster they are assigned to. As
+#   a consequence, k-means is more appropriate for clusters that are isotropic
+#   and normally distributed (i.e. spherical gaussians).
+# - Unequal variance: k-means is equivalent to taking the maximum likelihood
+#   estimator for a "mixture" of k gaussian distributions with the same
+#   variances but with possibly different means.
+# - Unevenly sized blobs: there is no theoretical result about k-means that
+#   states that it requires similar cluster sizes to perform well, yet
+#   minimizing euclidean distances does mean that the sparser and more
+#   high-dimensional the problem is, the greater the need to run the algorithm
+#   with different centroid seeds to reach a globally minimal inertia.
from sklearn.cluster import KMeans From 21ce3f746befc9bd1b53f0e6b46fe8ca900afa13 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 15:08:27 +0100 Subject: [PATCH 13/25] Add concluding remark --- examples/cluster/plot_kmeans_assumptions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index c43336d773749..d4055ac6ef558 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -157,3 +157,9 @@ plt.suptitle("KMeans and gaussian mixture clusters").set_y(0.95) plt.show() + +# %% +# In the case where clusters are known to be isotropic and have similar +# variance, the k-means algorithm is quite effective and is one of the fastest +# clustering algorithms available. This advantage is lost if one has to restart +# it several times to avoid convergence to a local minimum. From 6a7533521cb1eb15e808e39627d3b92d70fc34c7 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 15:34:32 +0100 Subject: [PATCH 14/25] Add comment on sparse high dimensional spaces --- examples/cluster/plot_kmeans_assumptions.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index d4055ac6ef558..1e26d50c5f52e 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -159,7 +159,13 @@ plt.show() # %% -# In the case where clusters are known to be isotropic and have similar -# variance, the k-means algorithm is quite effective and is one of the fastest -# clustering algorithms available. This advantage is lost if one has to restart -# it several times to avoid convergence to a local minimum. +# In sparse high-dimensional spaces, Euclidean distances tend to become inflated +# (not shown in this example). Running a dimensionality reduction algorithm +# prior to k-means clustering can alleviate this problem and speed up the +# computations (see the example +# :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`). +# +# In the case where clusters are known to be isotropic, have similar variance +# and are not too sparse, the k-means algorithm is quite effective and is one of +# the fastest clustering algorithms available. This advantage is lost if one has +# to restart it several times to avoid convergence to a local minimum. 
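The remark added just above suggests running a dimensionality reduction
algorithm prior to k-means in sparse high-dimensional settings. A hedged
sketch of that workflow follows; the 100-feature blobs and the choice of 10
principal components are illustrative assumptions, not settings taken from the
example:

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline

    # Three gaussian blobs embedded in a 100-dimensional space.
    X_hd, _ = make_blobs(
        n_samples=1500, n_features=100, centers=3, random_state=170
    )

    # Cluster the projection onto the leading principal components instead of
    # the raw features; KMeans only ever sees the reduced representation.
    pipeline = make_pipeline(
        PCA(n_components=10),
        KMeans(n_clusters=3, n_init=10, random_state=170),
    )
    labels = pipeline.fit_predict(X_hd)

For genuinely sparse inputs such as tf-idf matrices,
:class:`~sklearn.decomposition.TruncatedSVD` is the usual stand-in for PCA, as
in the document clustering example the patch links to.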
From 1df2551913e7814daf5e60267adaa7c0d15cba42 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 16:13:02 +0100 Subject: [PATCH 15/25] Fix shape error --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 1e26d50c5f52e..1086be881e103 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -35,7 +35,7 @@ X_filtered = np.vstack( (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]) ) # Unevenly sized blobs -y_filtered = [[0] * 500 + [1] * 100 + [2] * 10] +y_filtered = [0] * 500 + [1] * 100 + [2] * 10 # %% # We can visualize the resulting data: From 08df83dfa2df7fcfc3d6cacaf59aee95e58d17c7 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 22 Nov 2022 16:48:50 +0100 Subject: [PATCH 16/25] Update examples/cluster/plot_kmeans_assumptions.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 1086be881e103..df039b59c7122 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -93,7 +93,7 @@ y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of Blobs") +axs[0, 0].set_title("Incorrect Number of clusters") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) From a8be3b049b4262f9a1115b82a8de32cda028fdfd Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 23 Nov 2022 17:59:53 +0100 Subject: [PATCH 17/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Guillaume Lemaitre --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index df039b59c7122..35cf415afca43 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -93,7 +93,7 @@ y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of clusters") +axs[0, 0].set_title("Incorrect Number of Clusters") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) From dfada0ea9823c39fac683cdbed67587d25a4457c Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 24 Nov 2022 14:42:04 +0100 Subject: [PATCH 18/25] Address comments from Jeremie --- examples/cluster/plot_kmeans_assumptions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 35cf415afca43..e847f9237df99 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -45,7 +45,7 @@ fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y) -axs[0, 
0].set_title("Incorrect Number of Blobs") +axs[0, 0].set_title("Mixture of Gaussian Blobs") axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y) axs[0, 1].set_title("Anisotropically Distributed Blobs") @@ -66,9 +66,9 @@ # The previously generated data is now used for showing how # :class:`~sklearn.cluster.KMeans` behaves in the following scenarios: # -# - Incorrect number of blobs: in a real setting there is no uniquely defined -# **true** number of clusters. An appropriate number of clusters has to be -# decided from data-based criteria and knowledge of aim. +# - Non-optimal number of clusters: in a real setting there is no uniquely +# defined **true** number of clusters. An appropriate number of clusters has +# to be decided from data-based criteria and knowledge of aim. # - Anisotropically distributed blobs: k-means consists of minimizing sample's # euclidean distances to the centroid of the cluster they are assigned to. As # a consequence, k-means is more appropriate for clusters that are isotropic @@ -93,7 +93,7 @@ y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of Clusters") +axs[0, 0].set_title("Non-optimal Number of Clusters") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) @@ -139,7 +139,7 @@ y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of Blobs") +axs[0, 0].set_title("Optimal Number of Clusters") y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) From 0c4889c3b54cadf68dc9b220e963758971f0d75d Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 1 Dec 2022 14:58:13 +0100 Subject: [PATCH 19/25] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- examples/cluster/plot_kmeans_assumptions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index e847f9237df99..370a3adff2a79 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -123,9 +123,9 @@ # sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`. # # As anisotropic and unequal variances are real limitations of the k-means -# algorithmn, here we propose instead the use of -# :class:`~sklearn.mixture.GaussianMixture`, which also assume gaussian clusters -# but has no constraints on their variances. Notice that one still has to find +# algorithm, here we propose instead the use of +# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian clusters +# but does not impose any constraints on their variances. Notice that one still has to find # the correct number of blobs (see # :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`). 
# From 95dfde572015f004a42b146ead34e5a60e90bf9f Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 1 Dec 2022 15:40:01 +0100 Subject: [PATCH 20/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Tim Head --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 370a3adff2a79..66567bb3cc706 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -63,7 +63,7 @@ # Fit models and plot results # --------------------------- # -# The previously generated data is now used for showing how +# The previously generated data is now used to show how # :class:`~sklearn.cluster.KMeans` behaves in the following scenarios: # # - Non-optimal number of clusters: in a real setting there is no uniquely From 30ac8f8a2f71bd34fc2a90a5518399605d3a7ddb Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 1 Dec 2022 16:28:32 +0100 Subject: [PATCH 21/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Tim Head --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 66567bb3cc706..46cd8cc31772d 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -111,7 +111,7 @@ plt.show() # %% -# Possible solution +# Possible solutions # ----------------- # # For an example on how to find a correct number of blobs, see From 97287fdd3737e8001c76bd0cd446f2c62f3f5d28 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 1 Dec 2022 16:43:09 +0100 Subject: [PATCH 22/25] Fix format --- examples/cluster/plot_kmeans_assumptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 46cd8cc31772d..4d3baf5e4cc2a 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -112,7 +112,7 @@ # %% # Possible solutions -# ----------------- +# ------------------ # # For an example on how to find a correct number of blobs, see # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. @@ -124,9 +124,9 @@ # # As anisotropic and unequal variances are real limitations of the k-means # algorithm, here we propose instead the use of -# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian clusters -# but does not impose any constraints on their variances. Notice that one still has to find -# the correct number of blobs (see +# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian +# clusters but does not impose any constraints on their variances. Notice that +# one still has to find the correct number of blobs (see # :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`). 
# # For an example on how other clustering methods deal with anisotropic or From 6589634f5d60112ed3c77a10940607c2185fcc75 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 1 Dec 2022 16:43:25 +0100 Subject: [PATCH 23/25] Improve wording --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 4d3baf5e4cc2a..8902773a94cc4 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -68,7 +68,7 @@ # # - Non-optimal number of clusters: in a real setting there is no uniquely # defined **true** number of clusters. An appropriate number of clusters has -# to be decided from data-based criteria and knowledge of aim. +# to be decided from data-based criteria and knowledge of the intended goal. # - Anisotropically distributed blobs: k-means consists of minimizing sample's # euclidean distances to the centroid of the cluster they are assigned to. As # a consequence, k-means is more appropriate for clusters that are isotropic From 679e78bad26dfa68f385368ad7c2c16cd172e7c4 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 1 Dec 2022 16:55:45 +0100 Subject: [PATCH 24/25] Improve narrative --- examples/cluster/plot_kmeans_assumptions.py | 43 ++++++++++++--------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 8902773a94cc4..ae1ab3fc67799 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -117,11 +117,25 @@ # For an example on how to find a correct number of blobs, see # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. # In this case it suffices to set `n_clusters=3`. -# + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) +plt.scatter(X[:, 0], X[:, 1], c=y_pred) +plt.title("Optimal Number of Clusters") +plt.show() + +# %% # To deal with unevenly sized blobs one can increase the number of random # initializations. In this case we set `n_init=10` to avoid finding a # sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`. 
-# + +y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( + X_filtered +) +plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) +plt.title("Unevenly Sized Blobs \nwith several initializations") +plt.show() + +# %% # As anisotropic and unequal variances are real limitations of the k-means # algorithm, here we propose instead the use of # :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian @@ -135,30 +149,23 @@ from sklearn.mixture import GaussianMixture -fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) - -y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) -axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Optimal Number of Clusters") +fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6)) y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso) -axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) -axs[0, 1].set_title("Anisotropically Distributed Blobs") +ax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) +ax1.set_title("Anisotropically Distributed Blobs") y_pred = GaussianMixture(n_components=3).fit_predict(X_varied) -axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) -axs[1, 0].set_title("Unequal Variance") +ax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) +ax2.set_title("Unequal Variance") -y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( - X_filtered -) -axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) -axs[1, 1].set_title("Unevenly Sized Blobs") - -plt.suptitle("KMeans and gaussian mixture clusters").set_y(0.95) +plt.suptitle("Gaussian mixture clusters").set_y(0.95) plt.show() # %% +# Final remarks +# ------------- +# # In sparse high-dimensional spaces, Euclidean distances tend to become inflated # (not shown in this example). Running a dimensionality reduction algorithm # prior to k-means clustering can alleviate this problem and speed up the From 1ad8abb0759bf911a2d85e139f159c9654f36acc Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 2 Dec 2022 16:25:10 +0100 Subject: [PATCH 25/25] Update examples/cluster/plot_kmeans_assumptions.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index ae1ab3fc67799..bc1f01cb1cdd7 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -166,7 +166,7 @@ # Final remarks # ------------- # -# In sparse high-dimensional spaces, Euclidean distances tend to become inflated +# In high-dimensional spaces, Euclidean distances tend to become inflated # (not shown in this example). Running a dimensionality reduction algorithm # prior to k-means clustering can alleviate this problem and speed up the # computations (see the example