Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 579e7de

Browse files
authored
move kmpp public next to kmpp private (#19666)
1 parent 5ccfabf commit 579e7de

File tree

1 file changed

+93
-94
lines changed

1 file changed

+93
-94
lines changed

sklearn/cluster/_kmeans.py

Lines changed: 93 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,99 @@
4747
###############################################################################
4848
# Initialization heuristic
4949

50+
def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None,
                    random_state=None, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    .. versionadded:: 0.24

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to pick seeds from.

    n_clusters : int
        The number of centroids to initialize

    x_squared_norms : array-like of shape (n_samples,), default=None
        Squared Euclidean norm of each data point. When None, the norms
        are computed from X.

    random_state : int or RandomState instance, default=None
        Determines random number generation for centroid initialization. Pass
        an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_local_trials : int, default=None
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)).

    Returns
    -------
    centers : ndarray of shape (n_clusters, n_features)
        The initial centers for k-means.

    indices : ndarray of shape (n_clusters,)
        The index location of the chosen centers in the data array X. For a
        given index and center, X[index] = center.

    Raises
    ------
    ValueError
        If X has fewer samples than n_clusters, if the length of
        x_squared_norms differs from the number of samples in X, or if
        n_local_trials is given and is smaller than 1.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Examples
    --------

    >>> from sklearn.cluster import kmeans_plusplus
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)
    >>> centers
    array([[10,  4],
           [ 1,  0]])
    >>> indices
    array([4, 2])
    """

    # Check data
    validated = check_array(X, accept_sparse='csr',
                            dtype=[np.float64, np.float32])
    if not hasattr(X, "shape"):
        # Plain sequences (e.g. nested lists) are documented as valid input
        # but expose neither `.shape` nor `.dtype`, both of which are read
        # below. Fall back to the validated array for them only, so that
        # inputs which already carry a shape are passed through untouched
        # and keep producing exactly the same output as before.
        X = validated

    if X.shape[0] < n_clusters:
        raise ValueError(f"n_samples={X.shape[0]} should be >= "
                         f"n_clusters={n_clusters}.")

    # Check parameters
    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)
    else:
        x_squared_norms = check_array(x_squared_norms,
                                      dtype=X.dtype,
                                      ensure_2d=False)

    if x_squared_norms.shape[0] != X.shape[0]:
        raise ValueError(
            f"The length of x_squared_norms {x_squared_norms.shape[0]} should "
            f"be equal to the length of n_samples {X.shape[0]}.")

    if n_local_trials is not None and n_local_trials < 1:
        raise ValueError(
            f"n_local_trials is set to {n_local_trials} but should be an "
            f"integer value greater than zero.")

    random_state = check_random_state(random_state)

    # Call private k-means++
    centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms,
                                        random_state, n_local_trials)

    return centers, indices
142+
50143

51144
def _kmeans_plusplus(X, n_clusters, x_squared_norms,
52145
random_state, n_local_trials=None):
@@ -1924,97 +2017,3 @@ def _more_tags(self):
19242017
'zero sample_weight is not equivalent to removing samples',
19252018
}
19262019
}
1927-
1928-
1929-
def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None,
                    random_state=None, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    .. versionadded:: 0.24

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to pick seeds from.

    n_clusters : int
        The number of centroids to initialize

    x_squared_norms : array-like of shape (n_samples,), default=None
        Squared Euclidean norm of each data point. When None, the norms
        are computed from X.

    random_state : int or RandomState instance, default=None
        Determines random number generation for centroid initialization. Pass
        an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_local_trials : int, default=None
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)).

    Returns
    -------
    centers : ndarray of shape (n_clusters, n_features)
        The initial centers for k-means.

    indices : ndarray of shape (n_clusters,)
        The index location of the chosen centers in the data array X. For a
        given index and center, X[index] = center.

    Raises
    ------
    ValueError
        If X has fewer samples than n_clusters, if the length of
        x_squared_norms differs from the number of samples in X, or if
        n_local_trials is given and is smaller than 1.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Examples
    --------

    >>> from sklearn.cluster import kmeans_plusplus
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)
    >>> centers
    array([[10,  4],
           [ 1,  0]])
    >>> indices
    array([4, 2])
    """

    # Check data
    validated = check_array(X, accept_sparse='csr',
                            dtype=[np.float64, np.float32])
    if not hasattr(X, "shape"):
        # Plain sequences (e.g. nested lists) are documented as valid input
        # but expose neither `.shape` nor `.dtype`, both of which are read
        # below. Fall back to the validated array for them only, so that
        # inputs which already carry a shape are passed through untouched
        # and keep producing exactly the same output as before.
        X = validated

    if X.shape[0] < n_clusters:
        raise ValueError(f"n_samples={X.shape[0]} should be >= "
                         f"n_clusters={n_clusters}.")

    # Check parameters
    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)
    else:
        x_squared_norms = check_array(x_squared_norms,
                                      dtype=X.dtype,
                                      ensure_2d=False)

    if x_squared_norms.shape[0] != X.shape[0]:
        raise ValueError(
            f"The length of x_squared_norms {x_squared_norms.shape[0]} should "
            f"be equal to the length of n_samples {X.shape[0]}.")

    if n_local_trials is not None and n_local_trials < 1:
        raise ValueError(
            f"n_local_trials is set to {n_local_trials} but should be an "
            f"integer value greater than zero.")

    random_state = check_random_state(random_state)

    # Call private k-means++
    centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms,
                                        random_state, n_local_trials)

    return centers, indices

0 commit comments

Comments
 (0)