Closed
Description
KMeans: fit_predict(X) != fit(X).predict(X)
The two calls return different labels in several test cases in the following CI jobs:
https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223437
https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223439
_________ test_k_means_fit_predict[0-2-1e-07-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 0, max_iter = 2
tol = 1e-07
@pytest.mark.parametrize('algo', ['full', 'elkan'])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
@pytest.mark.parametrize('seed, max_iter, tol', [
(0, 2, 1e-7), # strict non-convergence
(1, 2, 1e-1), # loose non-convergence
(3, 300, 1e-7), # strict convergence
(4, 300, 1e-1), # loose convergence
])
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
# check that fit.predict gives same result as fit_predict
# There's a very small chance of failure with elkan on unstructured dataset
# because predict method uses fast euclidean distances computation which
# may cause small numerical instabilities.
if not (algo == 'elkan' and constructor is sp.csr_matrix):
rng = np.random.RandomState(seed)
X = make_blobs(n_samples=1000, n_features=10, centers=10,
random_state=rng)[0].astype(dtype, copy=False)
X = constructor(X)
kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
tol=tol, max_iter=max_iter, n_jobs=1)
labels_1 = kmeans.fit(X).predict(X)
labels_2 = kmeans.fit_predict(X)
> assert_array_equal(labels_1, labels_2)
E AssertionError:
E Arrays are not equal
E
E (mismatch 80.0%)
E x: array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
E 2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0, 2, 0, 9, 1, 9, 4, 3, 1, 5, 4, 1,
E 6, 3, 5, 9, 3, 9, 5, 4, 8, 2, 2, 0, 5, 7, 3, 7, 4, 9, 8, 6, 9, 0, 6,...
E y: array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
E 3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1, 3, 1, 8, 2, 8, 5, 4, 2, 0, 5, 2,
E 6, 4, 0, 8, 4, 8, 0, 5, 9, 3, 3, 1, 0, 7, 4, 7, 5, 8, 9, 6, 8, 1, 6,...
X = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
with 10000 stored elements in Compressed Sparse Row format>
algo = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype = <class 'numpy.float64'>
kmeans = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=2,
n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=0, tol=1e-07, verbose=0)
labels_1 = array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0,...3, 5, 1, 3, 3, 2, 3, 5, 4, 8, 8, 0, 8, 1, 7, 3, 6, 2, 2, 6, 3, 3,
3, 3, 8, 3, 7, 9, 8, 9, 5, 4, 2], dtype=int32)
labels_2 = array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1,...4, 0, 2, 4, 4, 3, 4, 0, 5, 9, 9, 1, 9, 2, 7, 4, 6, 3, 3, 6, 4, 4,
4, 4, 9, 4, 7, 8, 9, 8, 0, 5, 3], dtype=int32)
max_iter = 2
rng = <mtrand.RandomState object at 0x114933ea0>
seed = 0
tol = 1e-07
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError
_________ test_k_means_fit_predict[4-300-0.1-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 4, max_iter = 300
tol = 0.1
@pytest.mark.parametrize('algo', ['full', 'elkan'])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
@pytest.mark.parametrize('seed, max_iter, tol', [
(0, 2, 1e-7), # strict non-convergence
(1, 2, 1e-1), # loose non-convergence
(3, 300, 1e-7), # strict convergence
(4, 300, 1e-1), # loose convergence
])
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
# check that fit.predict gives same result as fit_predict
# There's a very small chance of failure with elkan on unstructured dataset
# because predict method uses fast euclidean distances computation which
# may cause small numerical instabilities.
if not (algo == 'elkan' and constructor is sp.csr_matrix):
rng = np.random.RandomState(seed)
X = make_blobs(n_samples=1000, n_features=10, centers=10,
random_state=rng)[0].astype(dtype, copy=False)
X = constructor(X)
kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
tol=tol, max_iter=max_iter, n_jobs=1)
labels_1 = kmeans.fit(X).predict(X)
labels_2 = kmeans.fit_predict(X)
> assert_array_equal(labels_1, labels_2)
E AssertionError:
E Arrays are not equal
E
E (mismatch 100.0%)
E x: array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
E 3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0, 6, 4, 6, 0, 4, 4, 9, 6, 1, 2, 0,
E 2, 5, 1, 4, 9, 1, 5, 3, 9, 6, 6, 9, 9, 8, 7, 1, 6, 2, 7, 0, 9, 1, 3,...
E y: array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
E 7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2, 4, 3, 4, 2, 3, 3, 1, 4, 5, 8, 2,
E 8, 6, 5, 3, 1, 5, 6, 7, 1, 4, 4, 1, 1, 0, 9, 5, 4, 8, 9, 2, 1, 5, 7,...
X = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
with 10000 stored elements in Compressed Sparse Row format>
algo = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype = <class 'numpy.float64'>
kmeans = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=300,
n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=4, tol=0.1, verbose=0)
labels_1 = array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0,...0, 0, 8, 5, 8, 2, 4, 7, 3, 3, 6, 8, 5, 7, 1, 2, 7, 1, 4, 9, 9, 5,
4, 2, 2, 7, 5, 9, 8, 4, 9, 0, 1], dtype=int32)
labels_2 = array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2,...2, 2, 0, 6, 0, 8, 3, 9, 7, 7, 4, 0, 6, 9, 5, 8, 9, 5, 3, 1, 1, 6,
3, 8, 8, 9, 6, 1, 0, 3, 1, 2, 5], dtype=int32)
max_iter = 300
rng = <mtrand.RandomState object at 0x1141c9708>
seed = 4
tol = 0.1
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError