From 6f757184d093865daa1f7ba16af0af85f8338af7 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 15 Jul 2020 15:06:26 +0200
Subject: [PATCH 1/2] refactor center init in KMeans

---
 sklearn/cluster/_kmeans.py            | 324 ++++++++++----------
 sklearn/cluster/tests/test_k_means.py |  12 -
 2 files changed, 123 insertions(+), 213 deletions(-)

diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index d0c9ba84a0fa4..9ed07ec752fe3 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -144,18 +144,6 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
 ###############################################################################
 # K-means batch estimation by EM (expectation maximization)
 
-def _validate_center_shape(X, n_centers, centers):
-    """Check if centers is compatible with X and n_centers"""
-    if centers.shape[0] != n_centers:
-        raise ValueError(
-            f"The shape of the initial centers {centers.shape} does not "
-            f"match the number of clusters {n_centers}.")
-    if centers.shape[1] != X.shape[1]:
-        raise ValueError(
-            f"The shape of the initial centers {centers.shape} does not "
-            f"match the number of features of the data {X.shape[1]}.")
-
-
 def _tolerance(X, tol):
     """Return a tolerance which is independent of the dataset"""
     if tol == 0:
@@ -308,9 +296,9 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
     return est.cluster_centers_, est.labels_, est.inertia_
 
 
-def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
-                         init='k-means++', verbose=False, x_squared_norms=None,
-                         random_state=None, tol=1e-4, n_threads=1):
+def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300,
+                         verbose=False, x_squared_norms=None, tol=1e-4,
+                         n_threads=1):
     """A single run of k-means elkan, assumes preparation completed prior.
 
     Parameters
@@ -321,40 +309,18 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
     sample_weight : array-like of shape (n_samples,)
         The weights for each observation in X.
 
-    n_clusters : int
-        The number of clusters to form as well as the number of
-        centroids to generate.
+    centers_init : ndarray of shape (n_clusters, n_features)
+        The initial centers.
 
     max_iter : int, default=300
         Maximum number of iterations of the k-means algorithm to run.
 
-    init : {'k-means++', 'random', ndarray, callable}, default='k-means++'
-        Method for initialization:
-
-        'k-means++' : selects initial cluster centers for k-mean
-        clustering in a smart way to speed up convergence. See section
-        Notes in k_init for more details.
-
-        'random': choose `n_clusters` observations (rows) at random from data
-        for the initial centroids.
-
-        If an ndarray is passed, it should be of shape (n_clusters, n_features)
-        and gives the initial centers.
-
-        If a callable is passed, it should take arguments X, n_clusters and a
-        random state and return an initialization.
-
     verbose : bool, default=False
         Verbosity mode
 
     x_squared_norms : array-like, default=None
        Precomputed x_squared_norms.
 
-    random_state : int, RandomState instance, default=None
-        Determines random number generation for centroid initialization. Use
-        an int to make the randomness deterministic.
-        See :term:`Glossary <random_state>`.
-
     tol : float, default=1e-4
         Relative tolerance with regards to Frobenius norm of the difference
         in the cluster centers of two consecutive iterations to declare
@@ -383,18 +349,11 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
     n_iter : int
         Number of iterations run.
""" - random_state = check_random_state(random_state) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - - # init - centers = _init_centroids(X, n_clusters, init, random_state=random_state, - x_squared_norms=x_squared_norms) - - if verbose: - print('Initialization complete') - n_samples = X.shape[0] + n_clusters = centers_init.shape[0] + # Buffers to avoid new allocations at each iteration. + centers = centers_init centers_new = np.zeros_like(centers) weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) labels = np.full(n_samples, -1, dtype=np.int32) @@ -431,18 +390,17 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, if verbose: inertia = _inertia(X, sample_weight, centers, labels) - print("Iteration {0}, inertia {1}" .format(i, inertia)) + print(f"Iteration {i}, inertia {inertia}") + + centers, centers_new = centers_new, centers center_shift_tot = (center_shift**2).sum() if center_shift_tot <= tol: if verbose: - print("Converged at iteration {0}: " - "center shift {1} within tolerance {2}" - .format(i, center_shift_tot, tol)) + print(f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}.") break - centers, centers_new = centers_new, centers - if center_shift_tot > 0: # rerun E-step so that predicted labels match cluster centers elkan_iter(X, sample_weight, centers, centers, weight_in_clusters, @@ -455,9 +413,9 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, - init='k-means++', verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, n_threads=1): +def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, + verbose=False, x_squared_norms=None, tol=1e-4, + n_threads=1): """A single run of k-means lloyd, assumes preparation completed prior. Parameters @@ -468,40 +426,18 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. - n_clusters : int - The number of clusters to form as well as the number of - centroids to generate. + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - init : {'k-means++', 'random', ndarray, callable}, default='k-means++' - Method for initialization: - - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - - 'random': choose `n_clusters` observations (rows) at random from data - for the initial centroids. - - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - If a callable is passed, it should take arguments X, n_clusters and a - random state and return an initialization. - verbose : bool, default=False Verbosity mode x_squared_norms : ndarray of shape(n_samples,), default=None Precomputed x_squared_norms. - random_state : int, RandomState instance or None, default=None - Determines random number generation for centroid initialization. Use - an int to make the randomness deterministic. - See :term:`Glossary `. 
-
     tol : float, default=1e-4
         Relative tolerance with regards to Frobenius norm of the difference
         in the cluster centers of two consecutive iterations to declare
@@ -530,16 +466,10 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300,
     n_iter : int
         Number of iterations run.
     """
-    random_state = check_random_state(random_state)
-    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
-
-    # init
-    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
-                              x_squared_norms=x_squared_norms)
-
-    if verbose:
-        print("Initialization complete")
+    n_clusters = centers_init.shape[0]
 
+    # Buffers to avoid new allocations at each iteration.
+    centers = centers_init
     centers_new = np.zeros_like(centers)
     labels = np.full(X.shape[0], -1, dtype=np.int32)
     weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)
@@ -561,7 +491,9 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300,
 
         if verbose:
             inertia = _inertia(X, sample_weight, centers, labels)
-            print("Iteration {0}, inertia {1}" .format(i, inertia))
+            print(f"Iteration {i}, inertia {inertia}.")
+
+        centers, centers_new = centers_new, centers
 
         center_shift_tot = (center_shift**2).sum()
         if center_shift_tot <= tol:
@@ -571,8 +503,6 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300,
                       .format(i, center_shift_tot, tol))
                 break
 
-        centers, centers_new = centers_new, centers
-
     if center_shift_tot > 0:
         # rerun E-step so that predicted labels match cluster centers
         lloyd_iter(X, sample_weight, x_squared_norms, centers, centers,
@@ -592,11 +522,11 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers,
 
     Parameters
     ----------
-    X : {array-like, sparse matrix} of shape (n_samples, n_features)
-        The input samples to assign to the labels. If sparse matrix, must be in
-        CSR format.
+    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
+        The input samples to assign to the labels. If sparse matrix, must
+        be in CSR format.
 
-    sample_weight : array-like of shape (n_samples,)
+    sample_weight : ndarray of shape (n_samples,)
         The weights for each observation in X.
 
     x_squared_norms : ndarray of shape (n_samples,)
@@ -614,7 +544,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers,
     Returns
     -------
     labels : ndarray of shape (n_samples,)
-        The resulting assignment
+        The resulting assignment.
 
     inertia : float
         Sum of squared distances of samples to their closest cluster center.
@@ -624,7 +554,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers,
 
     n_threads = _openmp_effective_n_threads(n_threads)
 
-    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
     labels = np.full(n_samples, -1, dtype=np.int32)
     weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype)
     center_shift = np.zeros_like(weight_in_clusters)
@@ -645,88 +574,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers,
     return labels, inertia
 
 
-def _init_centroids(X, n_clusters=8, init="k-means++", random_state=None,
-                    x_squared_norms=None, init_size=None):
-    """Compute the initial centroids
-
-    Parameters
-    ----------
-
-    X : {ndarray, spare matrix} of shape (n_samples, n_features)
-        The input samples.
-
-    n_clusters : int, default=8
-        number of centroids.
-
-    init : {'k-means++', 'random', ndarray, callable}, default="k-means++"
-        Method for initialization.
-
-    random_state : int, RandomState instance, default=None
-        Determines random number generation for centroid initialization. Use
-        an int to make the randomness deterministic.
-        See :term:`Glossary <random_state>`.
-
-    x_squared_norms : ndarray of shape (n_samples,), default=None
-        Squared euclidean norm of each data point. Pass it if you have it at
-        hands already to avoid it being recomputed here. Default: None
-
-    init_size : int, default=None
-        Number of samples to randomly sample for speeding up the
-        initialization (sometimes at the expense of accuracy): the
-        only algorithm is initialized by running a batch KMeans on a
-        random subset of the data. This needs to be larger than k.
-
-    Returns
-    -------
-    centers : array of shape(k, n_features)
-    """
-    random_state = check_random_state(random_state)
-    n_samples = X.shape[0]
-
-    if x_squared_norms is None:
-        x_squared_norms = row_norms(X, squared=True)
-
-    if init_size is not None and init_size < n_samples:
-        if init_size < n_clusters:
-            warnings.warn(
-                "init_size=%d should be larger than k=%d. "
-                "Setting it to 3*k" % (init_size, n_clusters),
-                RuntimeWarning, stacklevel=2)
-            init_size = 3 * n_clusters
-        init_indices = random_state.randint(0, n_samples, init_size)
-        X = X[init_indices]
-        x_squared_norms = x_squared_norms[init_indices]
-        n_samples = X.shape[0]
-    elif n_samples < n_clusters:
-        raise ValueError(
-            "n_samples={} should be larger than n_clusters={}"
-            .format(n_samples, n_clusters))
-
-    if isinstance(init, str) and init == 'k-means++':
-        centers = _k_init(X, n_clusters, random_state=random_state,
-                          x_squared_norms=x_squared_norms)
-    elif isinstance(init, str) and init == 'random':
-        seeds = random_state.permutation(n_samples)[:n_clusters]
-        centers = X[seeds]
-    elif hasattr(init, '__array__'):
-        # ensure that the centers have the same dtype as X
-        # this is a requirement of fused types of cython
-        centers = np.array(init, dtype=X.dtype)
-    elif callable(init):
-        centers = init(X, n_clusters, random_state=random_state)
-        centers = np.asarray(centers, dtype=X.dtype)
-    else:
-        raise ValueError("the init parameter for the k-means should "
-                         "be 'k-means++' or 'random' or an ndarray, "
-                         "'%s' (type '%s') was passed." % (init, type(init)))
-
-    if sp.issparse(centers):
-        centers = centers.toarray()
-
-    _validate_center_shape(X, n_clusters, centers)
-    return centers
-
-
 class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
     """K-Means clustering.
 
@@ -972,6 +819,17 @@ def _check_params(self, X):
                 f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2)
             self._n_init = 1
 
+    def _validate_center_shape(self, X, centers):
+        """Check if centers is compatible with X and n_clusters"""
+        if centers.shape[0] != self.n_clusters:
+            raise ValueError(
+                f"The shape of the initial centers {centers.shape} does not "
+                f"match the number of clusters {self.n_clusters}.")
+        if centers.shape[1] != X.shape[1]:
+            raise ValueError(
+                f"The shape of the initial centers {centers.shape} does not "
+                f"match the number of features of the data {X.shape[1]}.")
+
     def _check_test_data(self, X):
         X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
                         order='C', accept_large_sparse=False)
@@ -984,6 +842,62 @@ def _check_test_data(self, X):
 
         return X
 
+    def _init_centroids(self, X, x_squared_norms, init, random_state,
+                        init_size=None):
+        """Compute the initial centroids
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            The input samples.
+
+        x_squared_norms : ndarray of shape (n_samples,)
+            Squared euclidean norm of each data point. Pass it if you have it
+            at hands already to avoid it being recomputed here.
+
+        init : {'k-means++', 'random', ndarray, callable}
+            Method for initialization.
+
+        random_state : RandomState instance
+            Determines random number generation for centroid initialization.
+            See :term:`Glossary <random_state>`.
+
+        init_size : int, default=None
+            Number of samples to randomly sample for speeding up the
+            initialization (sometimes at the expense of accuracy).
+
+        Returns
+        -------
+        centers : ndarray of shape(n_clusters, n_features)
+        """
+        n_samples = X.shape[0]
+        n_clusters = self.n_clusters
+
+        if init_size is not None and init_size < n_samples:
+            init_indices = random_state.randint(0, n_samples, init_size)
+            X = X[init_indices]
+            x_squared_norms = x_squared_norms[init_indices]
+            n_samples = X.shape[0]
+
+        if isinstance(init, str) and init == 'k-means++':
+            centers = _k_init(X, n_clusters, random_state=random_state,
+                              x_squared_norms=x_squared_norms)
+        elif isinstance(init, str) and init == 'random':
+            seeds = random_state.permutation(n_samples)[:n_clusters]
+            centers = X[seeds]
+        elif hasattr(init, '__array__'):
+            centers = init
+        elif callable(init):
+            centers = init(X, n_clusters, random_state=random_state)
+            centers = check_array(
+                centers, dtype=X.dtype, copy=False, order='C')
+            self._validate_center_shape(X, centers)
+
+        if sp.issparse(centers):
+            centers = centers.toarray()
+
+        return centers
+
     def fit(self, X, y=None, sample_weight=None):
         """Compute k-means clustering.
 
@@ -1017,12 +931,13 @@ def fit(self, X, y=None, sample_weight=None):
         self._check_params(X)
         random_state = check_random_state(self.random_state)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
         # Validate init array
         init = self.init
         if hasattr(init, '__array__'):
             init = check_array(init, dtype=X.dtype, copy=True, order='C')
-            _validate_center_shape(X, self.n_clusters, init)
+            self._validate_center_shape(X, init)
 
         # subtract of mean of x for more accurate distance computations
         if not sp.issparse(X):
@@ -1041,22 +956,26 @@ def fit(self, X, y=None, sample_weight=None):
         else:
             kmeans_single = _kmeans_single_elkan
 
-        best_labels, best_inertia, best_centers = None, None, None
+        best_inertia = None
 
-        # seeds for the initializations of the kmeans runs.
-        seeds = random_state.randint(np.iinfo(np.int32).max, size=self._n_init)
+        for i in range(self._n_init):
+            # Initialize centers
+            centers_init = self._init_centroids(
+                X, x_squared_norms=x_squared_norms, init=init,
+                random_state=random_state)
+            if self.verbose:
+                print("Initialization complete")
 
-        for seed in seeds:
             # run a k-means once
             labels, inertia, centers, n_iter_ = kmeans_single(
-                X, sample_weight, self.n_clusters, max_iter=self.max_iter,
-                init=init, verbose=self.verbose, tol=self._tol,
-                x_squared_norms=x_squared_norms, random_state=seed,
-                n_threads=self._n_threads)
+                X, sample_weight, centers_init, max_iter=self.max_iter,
+                verbose=self.verbose, tol=self._tol,
+                x_squared_norms=x_squared_norms, n_threads=self._n_threads)
+
             # determine if these results are the best so far
             if best_inertia is None or inertia < best_inertia:
-                best_labels = labels.copy()
-                best_centers = centers.copy()
+                best_labels = labels
+                best_centers = centers
                 best_inertia = inertia
                 best_n_iter = n_iter_
 
@@ -1183,6 +1102,7 @@ def predict(self, X, sample_weight=None):
 
         X = self._check_test_data(X)
         x_squared_norms = row_norms(X, squared=True)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
         return _labels_inertia(X, sample_weight, x_squared_norms,
                                self.cluster_centers_, self._n_threads)[0]
@@ -1211,6 +1131,7 @@ def score(self, X, y=None, sample_weight=None):
 
         X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
         return -_labels_inertia(X, sample_weight, x_squared_norms,
                                 self.cluster_centers_)[1]
@@ -1695,7 +1616,7 @@ def fit(self, X, y=None, sample_weight=None):
         init = self.init
         if hasattr(init, '__array__'):
             init = check_array(init, dtype=X.dtype, copy=True, order='C')
-            _validate_center_shape(X, self.n_clusters, init)
+            self._validate_center_shape(X, init)
 
         n_samples, n_features = X.shape
         x_squared_norms = row_norms(X, squared=True)
@@ -1736,10 +1657,10 @@ def fit(self, X, y=None, sample_weight=None):
 
         # Initialize the centers using only a fraction of the data as we
         # expect n_samples to be very large when using MiniBatchKMeans
-        cluster_centers = _init_centroids(
-            X, self.n_clusters, init,
+        cluster_centers = self._init_centroids(
+            X, x_squared_norms=x_squared_norms,
+            init=init,
             random_state=random_state,
-            x_squared_norms=x_squared_norms,
             init_size=self._init_size)
 
         # Compute the label assignment on the init dataset
@@ -1877,13 +1798,14 @@ def partial_fit(self, X, y=None, sample_weight=None):
             init = self.init
             if hasattr(init, '__array__'):
                 init = check_array(init, dtype=X.dtype, copy=True, order='C')
-                _validate_center_shape(X, self.n_clusters, init)
+                self._validate_center_shape(X, init)
 
             # initialize the cluster centers
-            self.cluster_centers_ = _init_centroids(
-                X, self.n_clusters, init,
+            self.cluster_centers_ = self._init_centroids(
+                X, x_squared_norms=x_squared_norms,
+                init=init,
                 random_state=self._random_state,
-                x_squared_norms=x_squared_norms, init_size=self.init_size)
+                init_size=self._init_size)
 
             self._counts = np.zeros(self.n_clusters, dtype=sample_weight.dtype)
 
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 5d3829610f203..95b46ae11a787 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -713,18 +713,6 @@ def test_k_means_function():
                      sample_weight=None, init=centers)
 
 
-def test_x_squared_norms_init_centroids():
-    # Test that x_squared_norms can be None in _init_centroids
-    from sklearn.cluster._kmeans import _init_centroids
-
-    X_norms = np.sum(X**2, axis=1)
-    precompute = _init_centroids(
-        X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
-    assert_array_almost_equal(
-        precompute,
-        _init_centroids(X, 3, "k-means++", random_state=0))
-
-
 @pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
 def test_float_precision(Estimator, data):

From 3e009d957da923ecd37884f86485d4ae5cf29668 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Thu, 16 Jul 2020 11:18:14 +0200
Subject: [PATCH 2/2] address comments

---
 sklearn/cluster/_kmeans.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index 9ed07ec752fe3..a42a3b309a57d 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -179,7 +179,8 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
         The weights for each observation in X. If None, all observations
         are assigned equal weight
 
-    init : {'k-means++', 'random', ndarray, callable}, default='k-means++'
+    init : {'k-means++', 'random'}, callable or array-like of shape \
+            (n_clusters, n_features), default='k-means++'
         Method for initialization:
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
         Notes in k_init for more details.
 
@@ -189,7 +190,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
         'random': choose `n_clusters` observations (rows) at random from data
         for the initial centroids.
 
-        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        If an array is passed, it should be of shape (n_clusters, n_features)
         and gives the initial centers.
 
         If a callable is passed, it should take arguments X, n_clusters and a
         random state and return an initialization.
 
@@ -435,7 +436,7 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300,
     verbose : bool, default=False
         Verbosity mode
 
-    x_squared_norms : ndarray of shape(n_samples,), default=None
+    x_squared_norms : ndarray of shape (n_samples,), default=None
        Precomputed x_squared_norms.
 
     tol : float, default=1e-4
@@ -586,7 +587,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         The number of clusters to form as well as the number of
         centroids to generate.
 
-    init : {'k-means++', 'random', ndarray, callable}, default='k-means++'
+    init : {'k-means++', 'random'}, callable or array-like of shape \
+            (n_clusters, n_features), default='k-means++'
         Method for initialization:
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
         Notes in k_init for more details.
 
@@ -596,7 +598,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         'random': choose `n_clusters` observations (rows) at random from data
         for the initial centroids.
 
-        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        If an array is passed, it should be of shape (n_clusters, n_features)
         and gives the initial centers.
 
         If a callable is passed, it should take arguments X, n_clusters and a
         random state and return an initialization.
 
@@ -855,7 +857,8 @@ def _init_centroids(self, X, x_squared_norms, init, random_state,
             Squared euclidean norm of each data point. Pass it if you have it
             at hands already to avoid it being recomputed here.
 
-        init : {'k-means++', 'random', ndarray, callable}
+        init : {'k-means++', 'random'}, callable or ndarray of shape \
+                (n_clusters, n_features)
             Method for initialization.
 
         random_state : RandomState instance
             Determines random number generation for centroid initialization.
             See :term:`Glossary <random_state>`.
 
         init_size : int, default=None
             Number of samples to randomly sample for speeding up the
             initialization (sometimes at the expense of accuracy).
 
         Returns
         -------
-        centers : ndarray of shape(n_clusters, n_features)
+        centers : ndarray of shape (n_clusters, n_features)
         """
         n_samples = X.shape[0]
         n_clusters = self.n_clusters
@@ -1367,20 +1370,23 @@ class MiniBatchKMeans(KMeans):
         The number of clusters to form as well as the number of
         centroids to generate.
 
-    init : {'k-means++', 'random'} or ndarray of shape \
+    init : {'k-means++', 'random'}, callable or array-like of shape \
             (n_clusters, n_features), default='k-means++'
-        Method for initialization
+        Method for initialization:
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
         Notes in k_init for more details.
 
-        'random': choose k observations (rows) at random from data for
-        the initial centroids.
+        'random': choose `n_clusters` observations (rows) at random from data
+        for the initial centroids.
 
-        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        If an array is passed, it should be of shape (n_clusters, n_features)
         and gives the initial centers.
 
+        If a callable is passed, it should take arguments X, n_clusters and a
+        random state and return an initialization.
+
     max_iter : int, default=100
         Maximum number of iterations over the complete dataset before
         stopping independently of any early stopping criterion heuristics.
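
A note for readers following the refactor: after patch 1, KMeans.fit owns the whole n_init loop, and the single-run solvers only consume a ready-made centers_init array. The control flow can be summarized by the rough sketch below. This is not the scikit-learn source: kmeans_single and init_centroids stand in for _kmeans_single_lloyd/_kmeans_single_elkan and self._init_centroids, and all validation is omitted.

    def fit_sketch(X, n_init, kmeans_single, init_centroids, random_state):
        # One fresh initialization per run; the run with the lowest
        # inertia wins.
        best_inertia = None
        for _ in range(n_init):
            centers_init = init_centroids(X, random_state)
            labels, inertia, centers, n_iter = kmeans_single(X, centers_init)
            if best_inertia is None or inertia < best_inertia:
                best_labels, best_centers = labels, centers
                best_inertia, best_n_iter = inertia, n_iter
        return best_labels, best_inertia, best_centers, best_n_iter

This also suggests why the .copy() calls on labels and centers could be dropped: each run now works on freshly allocated buffers, so keeping plain references is presumably safe.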
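
The other behavioral detail worth calling out is the buffer swap that both solvers now perform right after each update, before the convergence test. Below is a self-contained toy version of the Lloyd loop, written only to illustrate that buffer handling; the real implementation lives in Cython helpers such as lloyd_iter and additionally reruns the E-step at the end so the returned labels match the final centers.

    import numpy as np

    def lloyd_sketch(X, centers_init, max_iter=300, tol=1e-4):
        centers = centers_init
        centers_new = np.zeros_like(centers)  # allocated once, reused
        for i in range(max_iter):
            # E-step: assign each sample to its nearest center.
            labels = ((X[:, None] - centers[None]) ** 2).sum(-1).argmin(1)
            # M-step: write the updated centers into the spare buffer.
            for k in range(centers.shape[0]):
                mask = labels == k
                centers_new[k] = X[mask].mean(0) if mask.any() else centers[k]
            center_shift_tot = ((centers_new - centers) ** 2).sum()
            # Swap before the convergence test, as in the patch, so that
            # `centers` always holds the freshly computed centers.
            centers, centers_new = centers_new, centers
            if center_shift_tot <= tol:
                break
        return labels, centers, i + 1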
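
The docstrings keep pointing at _k_init for the 'k-means++' strategy. For reference, here is a deliberately simplified sketch of that seeding (D^2 sampling): each new center is drawn with probability proportional to the squared distance to the nearest already-chosen center. The real _k_init additionally evaluates several candidate centers per step (n_local_trials) and reuses the precomputed x_squared_norms.

    import numpy as np

    def kmeans_plusplus_sketch(X, n_clusters, random_state):
        rng = np.random.RandomState(random_state)
        # First center: drawn uniformly among the samples.
        centers = [X[rng.randint(X.shape[0])]]
        for _ in range(n_clusters - 1):
            # Squared distance to the closest already-chosen center.
            d2 = ((X[:, None] - np.asarray(centers)[None]) ** 2).sum(-1).min(1)
            # Next center: drawn with probability proportional to d2.
            centers.append(X[rng.choice(X.shape[0], p=d2 / d2.sum())])
        return np.asarray(centers)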
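
Finally, the docstring updates in patch 2 spell out the three flavors of init in the public API. A small usage example (n_init=1 avoids the warning about explicit initial centers):

    import numpy as np
    from sklearn.cluster import KMeans

    X = np.array([[0., 0.], [0., 1.], [10., 0.], [10., 1.]])

    # Array-like init: must have shape (n_clusters, n_features), which is
    # what _validate_center_shape enforces.
    km = KMeans(n_clusters=2, init=np.array([[0., .5], [10., .5]]), n_init=1)
    km.fit(X)

    # Callable init: takes X, n_clusters and a random state, and returns
    # the initial centers.
    def first_rows(X, n_clusters, random_state):
        return X[:n_clusters]

    km = KMeans(n_clusters=2, init=first_rows, n_init=1).fit(X)
    print(km.cluster_centers_)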