From eb7ff43d9f50f331c4594df04a7f325b9587e3d9 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Tue, 26 Nov 2024 18:46:26 +0800 Subject: [PATCH 1/4] Add ref --- doc/modules/clustering.rst | 6 ++++++ sklearn/cluster/_kmeans.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 7cf593baf20d1..55d026111fe77 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -244,6 +244,9 @@ to the dataset :math:`X`. * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_plusplus.py`: Using K-means++ to select seeds for other clustering algorithms. +* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`: + Evaluating the impact of k-means initializations to convergence. + Low-level parallelism --------------------- @@ -311,6 +314,9 @@ small, as shown in the example and cited reference. * :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` +* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`: + Evaluating the impact of k-means initializations to convergence. + .. dropdown:: References * `"Web Scale K-Means clustering" diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 4fdcb4d5eea0f..3a5fc53d1edf4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1216,6 +1216,9 @@ class KMeans(_BaseKMeans): For an example of how to use the different `init` strategy, see the example entitled :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. + For an evaluation of the impact of initialization, see the example + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. + n_init : 'auto' or int, default='auto' Number of times the k-means algorithm is run with different centroid seeds. The final results is the best output of `n_init` consecutive runs @@ -1700,6 +1703,9 @@ class MiniBatchKMeans(_BaseKMeans): If a callable is passed, it should take arguments X, n_clusters and a random state and return an initialization. + For an evaluation of the impact of initialization, see the example + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. + max_iter : int, default=100 Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. From 563d15281cc678c5eae562a73614d490c405d967 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Tue, 26 Nov 2024 20:38:53 +0800 Subject: [PATCH 2/4] Update sklearn/cluster/_kmeans.py Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 3a5fc53d1edf4..dba4388d0100c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1213,8 +1213,8 @@ class KMeans(_BaseKMeans): * If a callable is passed, it should take arguments X, n_clusters and a\ random state and return an initialization. - For an example of how to use the different `init` strategy, see the example - entitled :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. + For an example of how to use the different `init` strategies, see + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. For an evaluation of the impact of initialization, see the example :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. From 77d55a715f5c89fd3eb6a3d936a9a4a2df363ab9 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Tue, 26 Nov 2024 20:39:07 +0800 Subject: [PATCH 3/4] Update doc/modules/clustering.rst Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> --- doc/modules/clustering.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 55d026111fe77..283aaba24b5e8 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -314,9 +314,6 @@ small, as shown in the example and cited reference. * :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` -* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`: - Evaluating the impact of k-means initializations to convergence. - .. dropdown:: References * `"Web Scale K-Means clustering" From f1859f4884ff3c5925a9aa5cad7b1b141e7bab1c Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Tue, 26 Nov 2024 20:42:59 +0800 Subject: [PATCH 4/4] Add link with context --- doc/modules/clustering.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 283aaba24b5e8..53e09829c1d41 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -222,9 +222,10 @@ initializations of the centroids. One method to help address this issue is the k-means++ initialization scheme, which has been implemented in scikit-learn (use the ``init='k-means++'`` parameter). This initializes the centroids to be (generally) distant from each other, leading to probably better results than -random initialization, as shown in the reference. For a detailed example of -comaparing different initialization schemes, refer to -:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. +random initialization, as shown in the reference. For detailed examples of +comparing different initialization schemes, refer to +:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py` and +:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. K-means++ can also be called independently to select seeds for other clustering algorithms, see :func:`sklearn.cluster.kmeans_plusplus` for details @@ -244,9 +245,6 @@ to the dataset :math:`X`. * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_plusplus.py`: Using K-means++ to select seeds for other clustering algorithms. -* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`: - Evaluating the impact of k-means initializations to convergence. - Low-level parallelism ---------------------