From 9302b292caefdc67d822d5a1bca1bbc608925b9c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 7 Feb 2023 11:53:26 +0100 Subject: [PATCH 1/2] ensure 2 bins --- doc/whats_new/v1.2.rst | 7 +++++++ sklearn/cluster/_bisect_k_means.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index c572acf49370c..fdb48fce032f5 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -27,6 +27,13 @@ Changes impacting all modules Changelog --------- +:mod:`sklearn.cluster` +...................... + +- |Fix| Fixed a bug in :class:`cluster.BisectingKMeans`, preventing `fit` from randomly + failing due to a permutation of the labels when running multiple inits. + :pr:`25563` by :user:`Jérémie du Boisberranger <jeremiedbb>`. + :mod:`sklearn.isotonic` ....................... diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index 277d88b1d1109..a31086fe2cdac 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -337,7 +337,7 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): X, best_centers, best_labels, sample_weight ) else: # bisecting_strategy == "largest_cluster" - scores = np.bincount(best_labels) + scores = np.bincount(best_labels, minlength=2) cluster_to_bisect.split(best_labels, best_centers, scores) From be2427e01123b87d7ab7907c4044bf4261fdd3f7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 7 Feb 2023 14:40:51 +0100 Subject: [PATCH 2/2] add comment --- sklearn/cluster/_bisect_k_means.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index a31086fe2cdac..b860596c03540 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -337,6 +337,8 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): X, best_centers, best_labels, sample_weight ) else: # 
bisecting_strategy == "largest_cluster" + # Using minlength to make sure that we have the counts for both labels even + # if all samples are labelled 0. scores = np.bincount(best_labels, minlength=2) cluster_to_bisect.split(best_labels, best_centers, scores)