From 0d074beb45d6f2898d67125676ac74a04b46df22 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Wed, 16 Nov 2022 17:01:45 +0100 Subject: [PATCH 01/25] First step to improve notebook style --- examples/cluster/plot_kmeans_assumptions.py | 78 ++++++++++----------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index b345197464de8..e1b4fecc6654c 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -14,56 +14,54 @@ # Author: Phil Roth # License: BSD 3 clause -import numpy as np -import matplotlib.pyplot as plt +# %% +# Data generation +# --------------- -from sklearn.cluster import KMeans +import numpy as np from sklearn.datasets import make_blobs -plt.figure(figsize=(12, 12)) - n_samples = 1500 random_state = 170 +transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]] + X, y = make_blobs(n_samples=n_samples, random_state=random_state) +X_aniso = np.dot(X, transformation) # Anisotropic blobs +X_varied, y_varied = make_blobs( + n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state +) # Unequal variance +X_filtered = np.vstack( + (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]) +) # Unevenly sized blobs -# Incorrect number of clusters -y_pred = KMeans(n_clusters=2, n_init="auto", random_state=random_state).fit_predict(X) +# %% +# Plot results +# ------------ -plt.subplot(221) -plt.scatter(X[:, 0], X[:, 1], c=y_pred) -plt.title("Incorrect Number of Blobs") +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans -# Anisotropicly distributed data -transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]] -X_aniso = np.dot(X, transformation) -y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict( - X_aniso -) +common_params = { + "n_init": "auto", + "random_state": random_state, +} -plt.subplot(222) -plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) -plt.title("Anisotropicly Distributed Blobs") +fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) -# Different variance -X_varied, y_varied = make_blobs( - n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state -) -y_pred = KMeans(n_clusters=3, n_init="auto", random_state=random_state).fit_predict( - X_varied -) - -plt.subplot(223) -plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) -plt.title("Unequal Variance") - -# Unevenly sized blobs -X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])) -y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( - X_filtered -) - -plt.subplot(224) -plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) -plt.title("Unevenly Sized Blobs") +y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) +axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) +axs[0, 0].set_title("Incorrect Number of Blobs") + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) +axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) +axs[0, 1].set_title("Anisotropicly Distributed Blobs") + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied) +axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) +axs[1, 0].set_title("Unequal Variance") + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered) +axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) +axs[1, 1].set_title("Unevenly Sized Blobs") plt.show() From 
0366ba470c7f07f8cd1f2eae1ad82ac09ad065ed Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 17 Nov 2022 16:42:06 +0100 Subject: [PATCH 02/25] Improve narrative --- examples/cluster/plot_kmeans_assumptions.py | 34 ++++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index e1b4fecc6654c..172d61f9ebac3 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -4,10 +4,23 @@ ==================================== This example is meant to illustrate situations where k-means will produce -unintuitive and possibly unexpected clusters. In the first three plots, the -input data does not conform to some implicit assumption that k-means makes and -undesirable clusters are produced as a result. In the last plot, k-means -returns intuitive clusters despite unevenly sized blobs. +unintuitive and possibly undesirable clusters. + +- Incorrect number of blobs: in a real setting there is no uniquely defined + **true** number of clusters. An appropriate number of clusters has to be + decided from data-based criteria and knowledge of aim. +- Anisotropically distributed blobs: k-means consists of minimizing sample's + euclidean distances to the centroid of the cluster they are assigned + to. As a consequence, k-means is more appropriated for clusters that are + normally distributed with a spherical covariance matrix. +- Unequal variance: k-means is equivalent to taking the maximum likelihood + estimator for a "mixture" of k gaussian distributions with the same variances + but with possibly different means. +- Unevenly sized blobs: there is no theoretical result about k-means that states + that it requires similar cluster sizes to perform well, yet minimizing + euclidean distances does mean that the more sparse and high-dimensional the + problem is, the higher is the need to run the algorithm with different + centroid seeds to ensure a minimal inertia. """ @@ -54,7 +67,7 @@ y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) -axs[0, 1].set_title("Anisotropicly Distributed Blobs") +axs[0, 1].set_title("Anisotropically Distributed Blobs") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied) axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) @@ -65,3 +78,14 @@ axs[1, 1].set_title("Unevenly Sized Blobs") plt.show() + +# %% +# For an example on how to find a correct number of blobs, see the example +# :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. +# +# For an example on how other clustering methods deal with anisotropic or +# unequal variance blobs, see the example +# :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`. +# +# For more details on how to deal with unevenly sized blobs, see +# :ref:`kmeans_sparse_high_dim`. 
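The bullets introduced in this patch lean on k-means' objective: the sum of
squared euclidean distances from each sample to the centroid of its assigned
cluster, exposed by scikit-learn as `KMeans.inertia_`. Here is a minimal sketch
of that equivalence; it reuses the example's `n_samples=1500` and
`random_state=170` settings but is otherwise not part of the patch series:

    import numpy as np

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1500, random_state=170)
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=170).fit(X)

    # Recompute the inertia by hand: the sum of squared euclidean distances
    # from each sample to the centroid of the cluster it is assigned to.
    manual_inertia = sum(
        ((X[kmeans.labels_ == k] - center) ** 2).sum()
        for k, center in enumerate(kmeans.cluster_centers_)
    )
    assert np.isclose(manual_inertia, kmeans.inertia_)

Because this objective penalizes squared distances equally in every direction,
minimizing it matches the equal-variance spherical gaussian reading given in
the "Unequal variance" bullet, which is why elongated or unequal-variance blobs
mislead the algorithm.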
From 3e999368671fab23fa7bc18206b3defc7ce658c6 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 17 Nov 2022 16:42:51 +0100 Subject: [PATCH 03/25] Add possible solution --- examples/cluster/plot_kmeans_assumptions.py | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 172d61f9ebac3..7039381352a44 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -89,3 +89,31 @@ # # For more details on how to deal with unevenly sized blobs, see # :ref:`kmeans_sparse_high_dim`. + +# %% +# Possible solution +# ----------------- + +from sklearn.mixture import GaussianMixture + +fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) +axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) +axs[0, 0].set_title("Incorrect Number of Blobs") + +y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_aniso) +axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) +axs[0, 1].set_title("Anisotropically Distributed Blobs") + +y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_varied) +axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) +axs[1, 0].set_title("Unequal Variance") + +y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( + X_filtered +) +axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) +axs[1, 1].set_title("Unevenly Sized Blobs") + +plt.show() From 2bbf0d0f22840e642ecfc4ca43ab99db0a603b04 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 18 Nov 2022 11:20:03 +0100 Subject: [PATCH 04/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Tim Head --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 7039381352a44..62a774b86d9e7 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -11,7 +11,7 @@ decided from data-based criteria and knowledge of aim. - Anisotropically distributed blobs: k-means consists of minimizing sample's euclidean distances to the centroid of the cluster they are assigned - to. As a consequence, k-means is more appropriated for clusters that are + to. As a consequence, k-means is more appropriate for clusters that are normally distributed with a spherical covariance matrix. - Unequal variance: k-means is equivalent to taking the maximum likelihood estimator for a "mixture" of k gaussian distributions with the same variances From 4f658b7d41ec65c41c4a900f79bc2b531ea02c1a Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 11:57:35 +0100 Subject: [PATCH 05/25] Wording tweak --- examples/cluster/plot_kmeans_assumptions.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 62a774b86d9e7..f4126e46e65fe 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -12,7 +12,7 @@ - Anisotropically distributed blobs: k-means consists of minimizing sample's euclidean distances to the centroid of the cluster they are assigned to. 
As a consequence, k-means is more appropriate for clusters that are - normally distributed with a spherical covariance matrix. + isotropic and normally distributed (i.e. spherical gaussians). - Unequal variance: k-means is equivalent to taking the maximum likelihood estimator for a "mixture" of k gaussian distributions with the same variances but with possibly different means. @@ -20,7 +20,7 @@ that it requires similar cluster sizes to perform well, yet minimizing euclidean distances does mean that the more sparse and high-dimensional the problem is, the higher is the need to run the algorithm with different - centroid seeds to ensure a minimal inertia. + centroid seeds to ensure a global minimal inertia. """ @@ -30,6 +30,10 @@ # %% # Data generation # --------------- +# +# The function :func:`~sklearn.datasets.make_blobs` generates isotropic gaussian +# blobs. To obtain anisotropic (elliptical) gaussian blobs one has to define a +# linear `transformation`. import numpy as np from sklearn.datasets import make_blobs From 33f3f051482067acde334b4479d14ec7cf8c9b9f Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:09:33 +0100 Subject: [PATCH 06/25] Improve narrative --- examples/cluster/plot_kmeans_assumptions.py | 22 ++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index f4126e46e65fe..3e6802c833349 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -84,19 +84,27 @@ plt.show() # %% +# Possible solution +# ----------------- +# # For an example on how to find a correct number of blobs, see the example # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. +# In this case it suffices to set `n_clusters=3`. +# +# To deal with unevenly sized blobs one can increase the number of random +# initializations. In this case we set `n_init=10` to avoid finding a +# sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`. +# +# As anisotropic and unequal variances are real limitations of the k-means +# algorithmn, here we propose instead the use of +# :class:`~sklearn.mixture.GaussianMixture`, which also assume gaussian clusters +# but has no constraints on their variances. Notice that one still has to find +# the correct number of blobs (see +# :ref:`sphx_glr_auto_examples_model_selection_plot_gmm_selection.py`). # # For an example on how other clustering methods deal with anisotropic or # unequal variance blobs, see the example # :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`. -# -# For more details on how to deal with unevenly sized blobs, see -# :ref:`kmeans_sparse_high_dim`. 
- -# %% -# Possible solution -# ----------------- from sklearn.mixture import GaussianMixture From cd8da2922090bed0918dfa2ecc50740fbb8405b9 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:09:55 +0100 Subject: [PATCH 07/25] Simplify code --- examples/cluster/plot_kmeans_assumptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 3e6802c833349..ebce1dc786653 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -114,11 +114,11 @@ axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) axs[0, 0].set_title("Incorrect Number of Blobs") -y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_aniso) +y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) axs[0, 1].set_title("Anisotropically Distributed Blobs") -y_pred = GaussianMixture(n_components=3, covariance_type="full").fit_predict(X_varied) +y_pred = GaussianMixture(n_components=3).fit_predict(X_varied) axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) axs[1, 0].set_title("Unequal Variance") From 03baf3086b4907222ca58fa643227ec26b60a785 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:23:00 +0100 Subject: [PATCH 08/25] Add co-author --- examples/cluster/plot_kmeans_assumptions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index ebce1dc786653..26c4a5cc1ceed 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -25,6 +25,7 @@ """ # Author: Phil Roth +# Arturo Amor # License: BSD 3 clause # %% From 2a6952b692e612ce926507affcb00a844b42882a Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:25:36 +0100 Subject: [PATCH 09/25] Tweak --- examples/cluster/plot_kmeans_assumptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 26c4a5cc1ceed..9f06d2bb31f70 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -53,8 +53,8 @@ ) # Unevenly sized blobs # %% -# Plot results -# ------------ +# Fit models and plot results +# --------------------------- import matplotlib.pyplot as plt from sklearn.cluster import KMeans From 00a889c262ccc45f7879335baf1e0776dc293849 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 18 Nov 2022 14:37:34 +0100 Subject: [PATCH 10/25] Iter --- examples/cluster/plot_kmeans_assumptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 9f06d2bb31f70..433c1447f5f46 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -88,7 +88,7 @@ # Possible solution # ----------------- # -# For an example on how to find a correct number of blobs, see the example +# For an example on how to find a correct number of blobs, see # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. # In this case it suffices to set `n_clusters=3`. # @@ -101,7 +101,7 @@ # :class:`~sklearn.mixture.GaussianMixture`, which also assume gaussian clusters # but has no constraints on their variances. 
Notice that one still has to find # the correct number of blobs (see -# :ref:`sphx_glr_auto_examples_model_selection_plot_gmm_selection.py`). +# :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`). # # For an example on how other clustering methods deal with anisotropic or # unequal variance blobs, see the example From 2c61da27ee0c91cda2a537b15a7e09f15119572b Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 14:36:39 +0100 Subject: [PATCH 11/25] Add plot of generated data --- doc/modules/clustering.rst | 4 ++-- examples/cluster/plot_kmeans_assumptions.py | 26 ++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index efdf01f55295b..7e84848e1605e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -170,7 +170,7 @@ It suffers from various drawbacks: k-means clustering can alleviate this problem and speed up the computations. -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_001.png +.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_002.png :target: ../auto_examples/cluster/plot_kmeans_assumptions.html :align: center :scale: 50 @@ -2110,5 +2110,5 @@ diagonal entries:: .. topic:: References - * :doi:`"Comparing Partitions" <10.1007/BF01908075>` + * :doi:`"Comparing Partitions" <10.1007/BF01908075>` L. Hubert and P. Arabie, Journal of Classification 1985 diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 433c1447f5f46..06a3e552b9787 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -51,12 +51,34 @@ X_filtered = np.vstack( (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]) ) # Unevenly sized blobs +y_filtered = [[0] * 500 + [1] * 100 + [2] * 10] + +# %% +# We can visualize the resulting data: + +import matplotlib.pyplot as plt + +fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) + +axs[0, 0].scatter(X[:, 0], X[:, 1], c=y) +axs[0, 0].set_title("Incorrect Number of Blobs") + +axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y) +axs[0, 1].set_title("Anisotropically Distributed Blobs") + +axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied) +axs[1, 0].set_title("Unequal Variance") + +axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered) +axs[1, 1].set_title("Unevenly Sized Blobs") + +plt.suptitle("Ground truth clusters").set_y(0.95) +plt.show() # %% # Fit models and plot results # --------------------------- -import matplotlib.pyplot as plt from sklearn.cluster import KMeans common_params = { @@ -82,6 +104,7 @@ axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) axs[1, 1].set_title("Unevenly Sized Blobs") +plt.suptitle("Unexpected KMeans clusters").set_y(0.95) plt.show() # %% @@ -129,4 +152,5 @@ axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) axs[1, 1].set_title("Unevenly Sized Blobs") +plt.suptitle("KMeans and gaussian mixture clusters").set_y(0.95) plt.show() From d2f850b5d9c0c7fd4af3418339db0d66cec3e3a1 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 15:08:10 +0100 Subject: [PATCH 12/25] Delay explanation after plotting as suggested by Guillaume --- examples/cluster/plot_kmeans_assumptions.py | 43 +++++++++++---------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 06a3e552b9787..c43336d773749 100644 --- 
a/examples/cluster/plot_kmeans_assumptions.py
+++ b/examples/cluster/plot_kmeans_assumptions.py
@@ -3,25 +3,9 @@
 Demonstration of k-means assumptions
 ====================================
 
-This example is meant to illustrate situations where k-means will produce
+This example is meant to illustrate situations where k-means produces
 unintuitive and possibly undesirable clusters.
 
-- Incorrect number of blobs: in a real setting there is no uniquely defined
-  **true** number of clusters. An appropriate number of clusters has to be
-  decided from data-based criteria and knowledge of aim.
-- Anisotropically distributed blobs: k-means consists of minimizing sample's
-  euclidean distances to the centroid of the cluster they are assigned
-  to. As a consequence, k-means is more appropriate for clusters that are
-  isotropic and normally distributed (i.e. spherical gaussians).
-- Unequal variance: k-means is equivalent to taking the maximum likelihood
-  estimator for a "mixture" of k gaussian distributions with the same variances
-  but with possibly different means.
-- Unevenly sized blobs: there is no theoretical result about k-means that states
-  that it requires similar cluster sizes to perform well, yet minimizing
-  euclidean distances does mean that the more sparse and high-dimensional the
-  problem is, the higher is the need to run the algorithm with different
-  centroid seeds to ensure a global minimal inertia.
-
 """
 
 # Author: Phil Roth
@@ -32,9 +16,9 @@
 # Data generation
 # ---------------
 #
-# The function :func:`~sklearn.datasets.make_blobs` generates isotropic gaussian
-# blobs. To obtain anisotropic (elliptical) gaussian blobs one has to define a
-# linear `transformation`.
+# The function :func:`~sklearn.datasets.make_blobs` generates isotropic
+# (spherical) gaussian blobs. To obtain anisotropic (elliptical) gaussian blobs
+# one has to define a linear `transformation`.
 
 import numpy as np
 from sklearn.datasets import make_blobs
@@ -78,6 +62,25 @@
 # %%
 # Fit models and plot results
 # ---------------------------
+#
+# The previously generated data is now used for showing how
+# :class:`~sklearn.cluster.KMeans` behaves in the following scenarios:
+#
+# - Incorrect number of blobs: in a real setting there is no uniquely defined
+#   **true** number of clusters. An appropriate number of clusters has to be
+#   decided from data-based criteria and knowledge of aim.
+# - Anisotropically distributed blobs: k-means consists of minimizing sample's
+#   euclidean distances to the centroid of the cluster they are assigned to. As
+#   a consequence, k-means is more appropriate for clusters that are isotropic
+#   and normally distributed (i.e. spherical gaussians).
+# - Unequal variance: k-means is equivalent to taking the maximum likelihood
+#   estimator for a "mixture" of k gaussian distributions with the same
+#   variances but with possibly different means.
+# - Unevenly sized blobs: there is no theoretical result about k-means that
+#   states that it requires similar cluster sizes to perform well, yet
+#   minimizing euclidean distances does mean that the sparser and more
+#   high-dimensional the problem is, the greater the need to run the algorithm
+#   with different centroid seeds to reach a globally minimal inertia.
from sklearn.cluster import KMeans From 21ce3f746befc9bd1b53f0e6b46fe8ca900afa13 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 15:08:27 +0100 Subject: [PATCH 13/25] Add concluding remark --- examples/cluster/plot_kmeans_assumptions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index c43336d773749..d4055ac6ef558 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -157,3 +157,9 @@ plt.suptitle("KMeans and gaussian mixture clusters").set_y(0.95) plt.show() + +# %% +# In the case where clusters are known to be isotropic and have similar +# variance, the k-means algorithm is quite effective and is one of the fastest +# clustering algorithms available. This advantage is lost if one has to restart +# it several times to avoid convergence to a local minimum. From 6a7533521cb1eb15e808e39627d3b92d70fc34c7 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 15:34:32 +0100 Subject: [PATCH 14/25] Add comment on sparse high dimensional spaces --- examples/cluster/plot_kmeans_assumptions.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index d4055ac6ef558..1e26d50c5f52e 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -159,7 +159,13 @@ plt.show() # %% -# In the case where clusters are known to be isotropic and have similar -# variance, the k-means algorithm is quite effective and is one of the fastest -# clustering algorithms available. This advantage is lost if one has to restart -# it several times to avoid convergence to a local minimum. +# In sparse high-dimensional spaces, Euclidean distances tend to become inflated +# (not shown in this example). Running a dimensionality reduction algorithm +# prior to k-means clustering can alleviate this problem and speed up the +# computations (see the example +# :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`). +# +# In the case where clusters are known to be isotropic, have similar variance +# and are not too sparse, the k-means algorithm is quite effective and is one of +# the fastest clustering algorithms available. This advantage is lost if one has +# to restart it several times to avoid convergence to a local minimum. 
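The remark added just above suggests running a dimensionality reduction
algorithm prior to k-means in sparse high-dimensional settings. A hedged
sketch of that workflow follows; the 100-feature blobs and the choice of 10
principal components are illustrative assumptions, not settings taken from the
example:

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline

    # Three gaussian blobs embedded in a 100-dimensional space.
    X_hd, _ = make_blobs(
        n_samples=1500, n_features=100, centers=3, random_state=170
    )

    # Cluster the projection onto the leading principal components instead of
    # the raw features; KMeans only ever sees the reduced representation.
    pipeline = make_pipeline(
        PCA(n_components=10),
        KMeans(n_clusters=3, n_init=10, random_state=170),
    )
    labels = pipeline.fit_predict(X_hd)

For genuinely sparse inputs such as tf-idf matrices,
:class:`~sklearn.decomposition.TruncatedSVD` is the usual stand-in for PCA, as
in the document clustering example the patch links to.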
From 1df2551913e7814daf5e60267adaa7c0d15cba42 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 21 Nov 2022 16:13:02 +0100 Subject: [PATCH 15/25] Fix shape error --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 1e26d50c5f52e..1086be881e103 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -35,7 +35,7 @@ X_filtered = np.vstack( (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]) ) # Unevenly sized blobs -y_filtered = [[0] * 500 + [1] * 100 + [2] * 10] +y_filtered = [0] * 500 + [1] * 100 + [2] * 10 # %% # We can visualize the resulting data: From 08df83dfa2df7fcfc3d6cacaf59aee95e58d17c7 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 22 Nov 2022 16:48:50 +0100 Subject: [PATCH 16/25] Update examples/cluster/plot_kmeans_assumptions.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 1086be881e103..df039b59c7122 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -93,7 +93,7 @@ y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of Blobs") +axs[0, 0].set_title("Incorrect Number of clusters") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) From a8be3b049b4262f9a1115b82a8de32cda028fdfd Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 23 Nov 2022 17:59:53 +0100 Subject: [PATCH 17/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Guillaume Lemaitre --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index df039b59c7122..35cf415afca43 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -93,7 +93,7 @@ y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of clusters") +axs[0, 0].set_title("Incorrect Number of Clusters") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) From dfada0ea9823c39fac683cdbed67587d25a4457c Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 24 Nov 2022 14:42:04 +0100 Subject: [PATCH 18/25] Address comments from Jeremie --- examples/cluster/plot_kmeans_assumptions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 35cf415afca43..e847f9237df99 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -45,7 +45,7 @@ fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y) -axs[0, 
0].set_title("Incorrect Number of Blobs") +axs[0, 0].set_title("Mixture of Gaussian Blobs") axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y) axs[0, 1].set_title("Anisotropically Distributed Blobs") @@ -66,9 +66,9 @@ # The previously generated data is now used for showing how # :class:`~sklearn.cluster.KMeans` behaves in the following scenarios: # -# - Incorrect number of blobs: in a real setting there is no uniquely defined -# **true** number of clusters. An appropriate number of clusters has to be -# decided from data-based criteria and knowledge of aim. +# - Non-optimal number of clusters: in a real setting there is no uniquely +# defined **true** number of clusters. An appropriate number of clusters has +# to be decided from data-based criteria and knowledge of aim. # - Anisotropically distributed blobs: k-means consists of minimizing sample's # euclidean distances to the centroid of the cluster they are assigned to. As # a consequence, k-means is more appropriate for clusters that are isotropic @@ -93,7 +93,7 @@ y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of Clusters") +axs[0, 0].set_title("Non-optimal Number of Clusters") y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) @@ -139,7 +139,7 @@ y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Incorrect Number of Blobs") +axs[0, 0].set_title("Optimal Number of Clusters") y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso) axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) From 0c4889c3b54cadf68dc9b220e963758971f0d75d Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 1 Dec 2022 14:58:13 +0100 Subject: [PATCH 19/25] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- examples/cluster/plot_kmeans_assumptions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index e847f9237df99..370a3adff2a79 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -123,9 +123,9 @@ # sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`. # # As anisotropic and unequal variances are real limitations of the k-means -# algorithmn, here we propose instead the use of -# :class:`~sklearn.mixture.GaussianMixture`, which also assume gaussian clusters -# but has no constraints on their variances. Notice that one still has to find +# algorithm, here we propose instead the use of +# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian clusters +# but does not impose any constraints on their variances. Notice that one still has to find # the correct number of blobs (see # :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`). 
# From 95dfde572015f004a42b146ead34e5a60e90bf9f Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 1 Dec 2022 15:40:01 +0100 Subject: [PATCH 20/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Tim Head --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 370a3adff2a79..66567bb3cc706 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -63,7 +63,7 @@ # Fit models and plot results # --------------------------- # -# The previously generated data is now used for showing how +# The previously generated data is now used to show how # :class:`~sklearn.cluster.KMeans` behaves in the following scenarios: # # - Non-optimal number of clusters: in a real setting there is no uniquely From 30ac8f8a2f71bd34fc2a90a5518399605d3a7ddb Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 1 Dec 2022 16:28:32 +0100 Subject: [PATCH 21/25] Update examples/cluster/plot_kmeans_assumptions.py Co-authored-by: Tim Head --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 66567bb3cc706..46cd8cc31772d 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -111,7 +111,7 @@ plt.show() # %% -# Possible solution +# Possible solutions # ----------------- # # For an example on how to find a correct number of blobs, see From 97287fdd3737e8001c76bd0cd446f2c62f3f5d28 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 1 Dec 2022 16:43:09 +0100 Subject: [PATCH 22/25] Fix format --- examples/cluster/plot_kmeans_assumptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 46cd8cc31772d..4d3baf5e4cc2a 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -112,7 +112,7 @@ # %% # Possible solutions -# ----------------- +# ------------------ # # For an example on how to find a correct number of blobs, see # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. @@ -124,9 +124,9 @@ # # As anisotropic and unequal variances are real limitations of the k-means # algorithm, here we propose instead the use of -# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian clusters -# but does not impose any constraints on their variances. Notice that one still has to find -# the correct number of blobs (see +# :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian +# clusters but does not impose any constraints on their variances. Notice that +# one still has to find the correct number of blobs (see # :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`). 
# # For an example on how other clustering methods deal with anisotropic or From 6589634f5d60112ed3c77a10940607c2185fcc75 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 1 Dec 2022 16:43:25 +0100 Subject: [PATCH 23/25] Improve wording --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 4d3baf5e4cc2a..8902773a94cc4 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -68,7 +68,7 @@ # # - Non-optimal number of clusters: in a real setting there is no uniquely # defined **true** number of clusters. An appropriate number of clusters has -# to be decided from data-based criteria and knowledge of aim. +# to be decided from data-based criteria and knowledge of the intended goal. # - Anisotropically distributed blobs: k-means consists of minimizing sample's # euclidean distances to the centroid of the cluster they are assigned to. As # a consequence, k-means is more appropriate for clusters that are isotropic From 679e78bad26dfa68f385368ad7c2c16cd172e7c4 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Thu, 1 Dec 2022 16:55:45 +0100 Subject: [PATCH 24/25] Improve narrative --- examples/cluster/plot_kmeans_assumptions.py | 43 ++++++++++++--------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 8902773a94cc4..ae1ab3fc67799 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -117,11 +117,25 @@ # For an example on how to find a correct number of blobs, see # :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. # In this case it suffices to set `n_clusters=3`. -# + +y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) +plt.scatter(X[:, 0], X[:, 1], c=y_pred) +plt.title("Optimal Number of Clusters") +plt.show() + +# %% # To deal with unevenly sized blobs one can increase the number of random # initializations. In this case we set `n_init=10` to avoid finding a # sub-optimal local minimum. For more details see :ref:`kmeans_sparse_high_dim`. 
-# + +y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( + X_filtered +) +plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) +plt.title("Unevenly Sized Blobs \nwith several initializations") +plt.show() + +# %% # As anisotropic and unequal variances are real limitations of the k-means # algorithm, here we propose instead the use of # :class:`~sklearn.mixture.GaussianMixture`, which also assumes gaussian @@ -135,30 +149,23 @@ from sklearn.mixture import GaussianMixture -fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12)) - -y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X) -axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred) -axs[0, 0].set_title("Optimal Number of Clusters") +fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6)) y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso) -axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) -axs[0, 1].set_title("Anisotropically Distributed Blobs") +ax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) +ax1.set_title("Anisotropically Distributed Blobs") y_pred = GaussianMixture(n_components=3).fit_predict(X_varied) -axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) -axs[1, 0].set_title("Unequal Variance") +ax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred) +ax2.set_title("Unequal Variance") -y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict( - X_filtered -) -axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) -axs[1, 1].set_title("Unevenly Sized Blobs") - -plt.suptitle("KMeans and gaussian mixture clusters").set_y(0.95) +plt.suptitle("Gaussian mixture clusters").set_y(0.95) plt.show() # %% +# Final remarks +# ------------- +# # In sparse high-dimensional spaces, Euclidean distances tend to become inflated # (not shown in this example). Running a dimensionality reduction algorithm # prior to k-means clustering can alleviate this problem and speed up the From 1ad8abb0759bf911a2d85e139f159c9654f36acc Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 2 Dec 2022 16:25:10 +0100 Subject: [PATCH 25/25] Update examples/cluster/plot_kmeans_assumptions.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- examples/cluster/plot_kmeans_assumptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index ae1ab3fc67799..bc1f01cb1cdd7 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -166,7 +166,7 @@ # Final remarks # ------------- # -# In sparse high-dimensional spaces, Euclidean distances tend to become inflated +# In high-dimensional spaces, Euclidean distances tend to become inflated # (not shown in this example). Running a dimensionality reduction algorithm # prior to k-means clustering can alleviate this problem and speed up the # computations (see the example