Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

test_k_means_fit_predict failing on some MacPython runs #12644

Closed
@jnothman

Description

@jnothman

`KMeans.fit_predict(X)` returns different labels than `KMeans.fit(X).predict(X)` in several test cases in the following MacPython wheel build jobs:

https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223437
https://travis-ci.org/MacPython/scikit-learn-wheels/jobs/458223439

_________ test_k_means_fit_predict[0-2-1e-07-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 0, max_iter = 2
tol = 1e-07
    @pytest.mark.parametrize('algo', ['full', 'elkan'])
    @pytest.mark.parametrize('dtype', [np.float32, np.float64])
    @pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
    @pytest.mark.parametrize('seed, max_iter, tol', [
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ])
    def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
        # check that fit.predict gives same result as fit_predict
        # There's a very small chance of failure with elkan on unstructured dataset
        # because predict method uses fast euclidean distances computation which
        # may cause small numerical instabilities.
        if not (algo == 'elkan' and constructor is sp.csr_matrix):
            rng = np.random.RandomState(seed)
    
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=rng)[0].astype(dtype, copy=False)
            X = constructor(X)
    
            kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                            tol=tol, max_iter=max_iter, n_jobs=1)
    
            labels_1 = kmeans.fit(X).predict(X)
            labels_2 = kmeans.fit_predict(X)
    
>           assert_array_equal(labels_1, labels_2)
E           AssertionError: 
E           Arrays are not equal
E           
E           (mismatch 80.0%)
E            x: array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
E                  2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0, 2, 0, 9, 1, 9, 4, 3, 1, 5, 4, 1,
E                  6, 3, 5, 9, 3, 9, 5, 4, 8, 2, 2, 0, 5, 7, 3, 7, 4, 9, 8, 6, 9, 0, 6,...
E            y: array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
E                  3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1, 3, 1, 8, 2, 8, 5, 4, 2, 0, 5, 2,
E                  6, 4, 0, 8, 4, 8, 0, 5, 9, 3, 3, 1, 0, 7, 4, 7, 5, 8, 9, 6, 8, 1, 6,...
X          = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>
algo       = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype      = <class 'numpy.float64'>
kmeans     = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=2,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=1e-07, verbose=0)
labels_1   = array([0, 5, 4, 1, 1, 2, 1, 2, 7, 4, 0, 3, 8, 9, 8, 3, 9, 4, 0, 5, 1, 1, 0,
       2, 5, 5, 9, 3, 2, 5, 7, 4, 1, 5, 0,...3, 5, 1, 3, 3, 2, 3, 5, 4, 8, 8, 0, 8, 1, 7, 3, 6, 2, 2, 6, 3, 3,
       3, 3, 8, 3, 7, 9, 8, 9, 5, 4, 2], dtype=int32)
labels_2   = array([1, 0, 5, 2, 2, 3, 2, 3, 7, 5, 1, 4, 9, 8, 9, 4, 8, 5, 1, 0, 2, 2, 1,
       3, 0, 0, 8, 4, 3, 0, 7, 5, 2, 0, 1,...4, 0, 2, 4, 4, 3, 4, 0, 5, 9, 9, 1, 9, 2, 7, 4, 6, 3, 3, 6, 4, 4,
       4, 4, 9, 4, 7, 8, 9, 8, 0, 5, 3], dtype=int32)
max_iter   = 2
rng        = <mtrand.RandomState object at 0x114933ea0>
seed       = 0
tol        = 1e-07
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError
_________ test_k_means_fit_predict[4-300-0.1-csr_matrix-float64-full] __________
algo = 'full', dtype = <class 'numpy.float64'>
constructor = <class 'scipy.sparse.csr.csr_matrix'>, seed = 4, max_iter = 300
tol = 0.1
    @pytest.mark.parametrize('algo', ['full', 'elkan'])
    @pytest.mark.parametrize('dtype', [np.float32, np.float64])
    @pytest.mark.parametrize('constructor', [np.asarray, sp.csr_matrix])
    @pytest.mark.parametrize('seed, max_iter, tol', [
        (0, 2, 1e-7),    # strict non-convergence
        (1, 2, 1e-1),    # loose non-convergence
        (3, 300, 1e-7),  # strict convergence
        (4, 300, 1e-1),  # loose convergence
    ])
    def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
        # check that fit.predict gives same result as fit_predict
        # There's a very small chance of failure with elkan on unstructured dataset
        # because predict method uses fast euclidean distances computation which
        # may cause small numerical instabilities.
        if not (algo == 'elkan' and constructor is sp.csr_matrix):
            rng = np.random.RandomState(seed)
    
            X = make_blobs(n_samples=1000, n_features=10, centers=10,
                           random_state=rng)[0].astype(dtype, copy=False)
            X = constructor(X)
    
            kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                            tol=tol, max_iter=max_iter, n_jobs=1)
    
            labels_1 = kmeans.fit(X).predict(X)
            labels_2 = kmeans.fit_predict(X)
    
>           assert_array_equal(labels_1, labels_2)
E           AssertionError: 
E           Arrays are not equal
E           
E           (mismatch 100.0%)
E            x: array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
E                  3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0, 6, 4, 6, 0, 4, 4, 9, 6, 1, 2, 0,
E                  2, 5, 1, 4, 9, 1, 5, 3, 9, 6, 6, 9, 9, 8, 7, 1, 6, 2, 7, 0, 9, 1, 3,...
E            y: array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
E                  7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2, 4, 3, 4, 2, 3, 3, 1, 4, 5, 8, 2,
E                  8, 6, 5, 3, 1, 5, 6, 7, 1, 4, 4, 1, 1, 0, 9, 5, 4, 8, 9, 2, 1, 5, 7,...
X          = <1000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>
algo       = 'full'
constructor = <class 'scipy.sparse.csr.csr_matrix'>
dtype      = <class 'numpy.float64'>
kmeans     = KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=4, tol=0.1, verbose=0)
labels_1   = array([7, 0, 7, 8, 8, 7, 8, 9, 2, 3, 6, 8, 9, 6, 9, 5, 1, 7, 0, 3, 9, 0, 6,
       3, 5, 5, 6, 3, 6, 1, 4, 7, 0, 4, 0,...0, 0, 8, 5, 8, 2, 4, 7, 3, 3, 6, 8, 5, 7, 1, 2, 7, 1, 4, 9, 9, 5,
       4, 2, 2, 7, 5, 9, 8, 4, 9, 0, 1], dtype=int32)
labels_2   = array([9, 2, 9, 0, 0, 9, 0, 1, 8, 7, 4, 0, 1, 4, 1, 6, 5, 9, 2, 7, 1, 2, 4,
       7, 6, 6, 4, 7, 4, 5, 3, 9, 2, 3, 2,...2, 2, 0, 6, 0, 8, 3, 9, 7, 7, 4, 0, 6, 9, 5, 8, 9, 5, 3, 1, 1, 6,
       3, 8, 8, 9, 6, 1, 0, 3, 1, 2, 5], dtype=int32)
max_iter   = 300
rng        = <mtrand.RandomState object at 0x1141c9708>
seed       = 4
tol        = 0.1
../venv/lib/python3.6/site-packages/sklearn/cluster/tests/test_k_means.py:352: AssertionError

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions