|
47 | 47 | ###############################################################################
|
48 | 48 | # Initialization heuristic
|
49 | 49 |
|
def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None,
                    random_state=None, n_local_trials=None):
    """Initialize ``n_clusters`` seeds using the k-means++ heuristic.

    .. versionadded:: 0.24

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to pick seeds from.

    n_clusters : int
        The number of centroids to initialize.

    x_squared_norms : array-like of shape (n_samples,), default=None
        Squared Euclidean norm of each data point. Computed from ``X``
        when not provided.

    random_state : int or RandomState instance, default=None
        Determines random number generation for centroid initialization. Pass
        an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_local_trials : int, default=None
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)).

    Returns
    -------
    centers : ndarray of shape (n_clusters, n_features)
        The initial centers for k-means.

    indices : ndarray of shape (n_clusters,)
        The index location of the chosen centers in the data array X. For a
        given index and center, X[index] = center.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Examples
    --------

    >>> from sklearn.cluster import kmeans_plusplus
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)
    >>> centers
    array([[10, 4],
           [ 1, 0]])
    >>> indices
    array([4, 2])
    """

    # Validate the data layout; only checks, X itself is passed on unchanged
    # so integer input keeps producing integer centers (see doctest above).
    check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])

    n_samples = X.shape[0]
    if n_samples < n_clusters:
        raise ValueError(f"n_samples={n_samples} should be >= "
                         f"n_clusters={n_clusters}.")

    # Obtain per-sample squared norms: compute them when absent,
    # otherwise validate the user-supplied array.
    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)
    else:
        x_squared_norms = check_array(x_squared_norms, dtype=X.dtype,
                                      ensure_2d=False)

    if x_squared_norms.shape[0] != n_samples:
        raise ValueError(
            f"The length of x_squared_norms {x_squared_norms.shape[0]} should "
            f"be equal to the length of n_samples {n_samples}.")

    if n_local_trials is not None and n_local_trials < 1:
        raise ValueError(
            f"n_local_trials is set to {n_local_trials} but should be an "
            f"integer value greater than zero.")

    random_state = check_random_state(random_state)

    # All inputs validated; delegate the seeding to the private routine.
    return _kmeans_plusplus(X, n_clusters, x_squared_norms,
                            random_state, n_local_trials)
50 | 143 |
|
51 | 144 | def _kmeans_plusplus(X, n_clusters, x_squared_norms,
|
52 | 145 | random_state, n_local_trials=None):
|
@@ -1924,97 +2017,3 @@ def _more_tags(self):
|
1924 | 2017 | 'zero sample_weight is not equivalent to removing samples',
|
1925 | 2018 | }
|
1926 | 2019 | }
|
1927 |
| - |
1928 |
| - |
1929 |
| -def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, |
1930 |
| - random_state=None, n_local_trials=None): |
1931 |
| - """Init n_clusters seeds according to k-means++ |
1932 |
| -
|
1933 |
| - .. versionadded:: 0.24 |
1934 |
| -
|
1935 |
| - Parameters |
1936 |
| - ---------- |
1937 |
| - X : {array-like, sparse matrix} of shape (n_samples, n_features) |
1938 |
| - The data to pick seeds from. |
1939 |
| -
|
1940 |
| - n_clusters : int |
1941 |
| - The number of centroids to initialize |
1942 |
| -
|
1943 |
| - x_squared_norms : array-like of shape (n_samples,), default=None |
1944 |
| - Squared Euclidean norm of each data point. |
1945 |
| -
|
1946 |
| - random_state : int or RandomState instance, default=None |
1947 |
| - Determines random number generation for centroid initialization. Pass |
1948 |
| - an int for reproducible output across multiple function calls. |
1949 |
| - See :term:`Glossary <random_state>`. |
1950 |
| -
|
1951 |
| - n_local_trials : int, default=None |
1952 |
| - The number of seeding trials for each center (except the first), |
1953 |
| - of which the one reducing inertia the most is greedily chosen. |
1954 |
| - Set to None to make the number of trials depend logarithmically |
1955 |
| - on the number of seeds (2+log(k)). |
1956 |
| -
|
1957 |
| - Returns |
1958 |
| - ------- |
1959 |
| - centers : ndarray of shape (n_clusters, n_features) |
1960 |
| - The inital centers for k-means. |
1961 |
| -
|
1962 |
| - indices : ndarray of shape (n_clusters,) |
1963 |
| - The index location of the chosen centers in the data array X. For a |
1964 |
| - given index and center, X[index] = center. |
1965 |
| -
|
1966 |
| - Notes |
1967 |
| - ----- |
1968 |
| - Selects initial cluster centers for k-mean clustering in a smart way |
1969 |
| - to speed up convergence. see: Arthur, D. and Vassilvitskii, S. |
1970 |
| - "k-means++: the advantages of careful seeding". ACM-SIAM symposium |
1971 |
| - on Discrete algorithms. 2007 |
1972 |
| -
|
1973 |
| - Examples |
1974 |
| - -------- |
1975 |
| -
|
1976 |
| - >>> from sklearn.cluster import kmeans_plusplus |
1977 |
| - >>> import numpy as np |
1978 |
| - >>> X = np.array([[1, 2], [1, 4], [1, 0], |
1979 |
| - ... [10, 2], [10, 4], [10, 0]]) |
1980 |
| - >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) |
1981 |
| - >>> centers |
1982 |
| - array([[10, 4], |
1983 |
| - [ 1, 0]]) |
1984 |
| - >>> indices |
1985 |
| - array([4, 2]) |
1986 |
| - """ |
1987 |
| - |
1988 |
| - # Check data |
1989 |
| - check_array(X, accept_sparse='csr', |
1990 |
| - dtype=[np.float64, np.float32]) |
1991 |
| - |
1992 |
| - if X.shape[0] < n_clusters: |
1993 |
| - raise ValueError(f"n_samples={X.shape[0]} should be >= " |
1994 |
| - f"n_clusters={n_clusters}.") |
1995 |
| - |
1996 |
| - # Check parameters |
1997 |
| - if x_squared_norms is None: |
1998 |
| - x_squared_norms = row_norms(X, squared=True) |
1999 |
| - else: |
2000 |
| - x_squared_norms = check_array(x_squared_norms, |
2001 |
| - dtype=X.dtype, |
2002 |
| - ensure_2d=False) |
2003 |
| - |
2004 |
| - if x_squared_norms.shape[0] != X.shape[0]: |
2005 |
| - raise ValueError( |
2006 |
| - f"The length of x_squared_norms {x_squared_norms.shape[0]} should " |
2007 |
| - f"be equal to the length of n_samples {X.shape[0]}.") |
2008 |
| - |
2009 |
| - if n_local_trials is not None and n_local_trials < 1: |
2010 |
| - raise ValueError( |
2011 |
| - f"n_local_trials is set to {n_local_trials} but should be an " |
2012 |
| - f"integer value greater than zero.") |
2013 |
| - |
2014 |
| - random_state = check_random_state(random_state) |
2015 |
| - |
2016 |
| - # Call private k-means++ |
2017 |
| - centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, |
2018 |
| - random_state, n_local_trials) |
2019 |
| - |
2020 |
| - return centers, indices |
0 commit comments