diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index f13e0f1bbb2fa..a9c8fb73f9552 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -30,10 +30,9 @@ run_tests() { cp setup.cfg $TEST_DIR cd $TEST_DIR - # Skip tests that require large downloads over the network to save bandwidth - # usage as travis workers are stateless and therefore traditional local - # disk caching does not work. - export SKLEARN_SKIP_NETWORK_TESTS=1 + # Tests that require large downloads over the network are skipped in CI. + # Here we make sure that they are still run on a regular basis. + export SKLEARN_SKIP_NETWORK_TESTS=0 if [[ "$COVERAGE" == "true" ]]; then TEST_CMD="$TEST_CMD --cov sklearn" diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py new file mode 100644 index 0000000000000..85242d7335685 --- /dev/null +++ b/sklearn/datasets/tests/conftest.py @@ -0,0 +1,61 @@ +""" Network tests are only run if data is already locally available, +or if download is specifically requested by environment variable.""" +from os import environ +import pytest +from sklearn.datasets import fetch_20newsgroups +from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.datasets import fetch_california_housing +from sklearn.datasets import fetch_covtype +from sklearn.datasets import fetch_kddcup99 +from sklearn.datasets import fetch_olivetti_faces +from sklearn.datasets import fetch_rcv1 + + +def _wrapped_fetch(f, dataset_name): + """ Fetch dataset (download if missing and requested by environment) """ + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' + + def wrapped(*args, **kwargs): + kwargs['download_if_missing'] = download_if_missing + try: + return f(*args, **kwargs) + except IOError: + pytest.skip("Download {} to run this test".format(dataset_name)) + return wrapped + + +@pytest.fixture +def fetch_20newsgroups_fxt(): + return 
_wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups') + + +@pytest.fixture +def fetch_20newsgroups_vectorized_fxt(): + return _wrapped_fetch(fetch_20newsgroups_vectorized, + dataset_name='20newsgroups_vectorized') + + +@pytest.fixture +def fetch_california_housing_fxt(): + return _wrapped_fetch(fetch_california_housing, + dataset_name='california_housing') + + +@pytest.fixture +def fetch_covtype_fxt(): + return _wrapped_fetch(fetch_covtype, dataset_name='covtype') + + +@pytest.fixture +def fetch_kddcup99_fxt(): + return _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99') + + +@pytest.fixture +def fetch_olivetti_faces_fxt(): + return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces') + + +@pytest.fixture +def fetch_rcv1_fxt(): + return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1') diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 15cb49c44b0e5..f800a49238ec1 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -1,25 +1,21 @@ -"""Test the 20news downloader, if the data is available.""" +"""Test the 20news downloader, if the data is available, +or if specifically requested via environment variable +(e.g. 
for travis cron job).""" from functools import partial import numpy as np import scipy.sparse as sp -from sklearn.utils._testing import SkipTest, assert_allclose_dense_sparse +from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.datasets.tests.test_common import check_return_X_y - -from sklearn import datasets from sklearn.preprocessing import normalize -def test_20news(): - try: - data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=False, shuffle=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") +def test_20news(fetch_20newsgroups_fxt): + data = fetch_20newsgroups_fxt(subset='all', shuffle=False) # Extract a reduced dataset - data2cats = datasets.fetch_20newsgroups( + data2cats = fetch_20newsgroups_fxt( subset='all', categories=data.target_names[-1:-3:-1], shuffle=False) # Check that the ordering of the target_names is the same # as the ordering in the full dataset @@ -40,72 +36,53 @@ def test_20news(): assert entry1 == entry2 # check that return_X_y option - X, y = datasets.fetch_20newsgroups( - subset='all', shuffle=False, return_X_y=True - ) + X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True) assert len(X) == len(data.data) assert y.shape == data.target.shape -def test_20news_length_consistency(): +def test_20news_length_consistency(fetch_20newsgroups_fxt): """Checks the length consistencies within the bunch This is a non-regression test for a bug present in 0.16.1. 
""" - try: - data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=False, shuffle=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") # Extract the full dataset - data = datasets.fetch_20newsgroups(subset='all') + data = fetch_20newsgroups_fxt(subset='all') assert len(data['data']) == len(data.data) assert len(data['target']) == len(data.target) assert len(data['filenames']) == len(data.filenames) -def test_20news_vectorized(): - try: - datasets.fetch_20newsgroups(subset='all', - download_if_missing=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") - +def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt): # test subset = train - bunch = datasets.fetch_20newsgroups_vectorized(subset="train") + bunch = fetch_20newsgroups_vectorized_fxt(subset="train") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314, 130107) assert bunch.target.shape[0] == 11314 assert bunch.data.dtype == np.float64 # test subset = test - bunch = datasets.fetch_20newsgroups_vectorized(subset="test") + bunch = fetch_20newsgroups_vectorized_fxt(subset="test") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (7532, 130107) assert bunch.target.shape[0] == 7532 assert bunch.data.dtype == np.float64 # test return_X_y option - fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test') + fetch_func = partial(fetch_20newsgroups_vectorized_fxt, subset='test') check_return_X_y(bunch, fetch_func) # test subset = all - bunch = datasets.fetch_20newsgroups_vectorized(subset='all') + bunch = fetch_20newsgroups_vectorized_fxt(subset='all') assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314 + 7532, 130107) assert bunch.target.shape[0] == 11314 + 7532 assert bunch.data.dtype == np.float64 -def test_20news_normalization(): - try: - X = datasets.fetch_20newsgroups_vectorized(normalize=False, - download_if_missing=False) - X_ = 
datasets.fetch_20newsgroups_vectorized(normalize=True, - download_if_missing=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") - +def test_20news_normalization(fetch_20newsgroups_vectorized_fxt): + X = fetch_20newsgroups_vectorized_fxt(normalize=False) + X_ = fetch_20newsgroups_vectorized_fxt(normalize=True) X_norm = X_['data'][:100] X = X['data'][:100] diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 6112bf966b303..af1e1ff1370e1 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -1,48 +1,25 @@ -"""Test the california_housing loader. - -Skipped if california_housing is not already downloaded to data_home. -""" - +"""Test the california_housing loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" import pytest -from sklearn.datasets import fetch_california_housing from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def fetch(*args, **kwargs): - return fetch_california_housing(*args, download_if_missing=False, **kwargs) - - -def _is_california_housing_dataset_not_available(): - try: - fetch_california_housing(download_if_missing=False) - return False - except IOError: - return True - - -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_fetch(): - data = fetch() +def test_fetch(fetch_california_housing_fxt): + data = fetch_california_housing_fxt() assert((20640, 8) == data.data.shape) assert((20640, ) == data.target.shape) # test return_X_y option - fetch_func = partial(fetch) + fetch_func = partial(fetch_california_housing_fxt) check_return_X_y(data, fetch_func) -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run 
this test' -) -def test_fetch_asframe(): +def test_fetch_asframe(fetch_california_housing_fxt): pd = pytest.importorskip('pandas') - bunch = fetch(as_frame=True) + bunch = fetch_california_housing_fxt(as_frame=True) frame = bunch.frame assert hasattr(bunch, 'frame') is True assert frame.shape == (20640, 9) @@ -50,11 +27,7 @@ def test_fetch_asframe(): assert isinstance(bunch.target, pd.Series) -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_pandas_dependency_message(): +def test_pandas_dependency_message(fetch_california_housing_fxt): try: import pandas # noqa pytest.skip("This test requires pandas to be not installed") @@ -64,4 +37,4 @@ def test_pandas_dependency_message(): expected_msg = ('fetch_california_housing with as_frame=True' ' requires pandas') with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing(as_frame=True) + fetch_california_housing_fxt(as_frame=True) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 1127b8114c5e7..d966e6c3890d0 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,25 +1,14 @@ -"""Test the covtype loader. +"""Test the covtype loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" -Skipped if covtype is not already downloaded to data_home. 
-""" - -from sklearn.datasets import fetch_covtype -from sklearn.utils._testing import SkipTest from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def fetch(*args, **kwargs): - return fetch_covtype(*args, download_if_missing=False, **kwargs) - - -def test_fetch(): - try: - data1 = fetch(shuffle=True, random_state=42) - except IOError: - raise SkipTest("Covertype dataset can not be loaded.") - - data2 = fetch(shuffle=True, random_state=37) +def test_fetch(fetch_covtype_fxt): + data1 = fetch_covtype_fxt(shuffle=True, random_state=42) + data2 = fetch_covtype_fxt(shuffle=True, random_state=37) X1, X2 = data1['data'], data2['data'] assert (581012, 54) == X1.shape @@ -32,5 +21,5 @@ def test_fetch(): assert (X1.shape[0],) == y2.shape # test return_X_y option - fetch_func = partial(fetch) + fetch_func = partial(fetch_covtype_fxt) check_return_X_y(data1, fetch_func) diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 6d371e5a8e6f0..899abd2bcb153 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -1,55 +1,46 @@ -"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data -is too big to use in unit-testing. +"""Test kddcup99 loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job). -The test is skipped if the data wasn't previously fetched and saved to -scikit-learn data folder. +Only 'percent10' mode is tested, as the full data +is too big to use in unit-testing. 
""" -from sklearn.datasets import fetch_kddcup99 from sklearn.datasets.tests.test_common import check_return_X_y -from sklearn.utils._testing import SkipTest from functools import partial - -def test_percent10(): - try: - data = fetch_kddcup99(download_if_missing=False) - except IOError: - raise SkipTest("kddcup99 dataset can not be loaded.") +def test_percent10(fetch_kddcup99_fxt): + data = fetch_kddcup99_fxt() assert data.data.shape == (494021, 41) assert data.target.shape == (494021,) - data_shuffled = fetch_kddcup99(shuffle=True, random_state=0) + data_shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0) assert data.data.shape == data_shuffled.data.shape assert data.target.shape == data_shuffled.target.shape - data = fetch_kddcup99('SA') + data = fetch_kddcup99_fxt('SA') assert data.data.shape == (100655, 41) assert data.target.shape == (100655,) - data = fetch_kddcup99('SF') + data = fetch_kddcup99_fxt('SF') assert data.data.shape == (73237, 4) assert data.target.shape == (73237,) - data = fetch_kddcup99('http') + data = fetch_kddcup99_fxt('http') assert data.data.shape == (58725, 3) assert data.target.shape == (58725,) - data = fetch_kddcup99('smtp') + data = fetch_kddcup99_fxt('smtp') assert data.data.shape == (9571, 3) assert data.target.shape == (9571,) - fetch_func = partial(fetch_kddcup99, 'smtp') + fetch_func = partial(fetch_kddcup99_fxt, 'smtp') check_return_X_y(data, fetch_func) -def test_shuffle(): - try: - dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True, - percent10=True, download_if_missing=False) - except IOError: - raise SkipTest("kddcup99 dataset can not be loaded.") - +def test_shuffle(fetch_kddcup99_fxt): + dataset = fetch_kddcup99_fxt(random_state=0, subset='SA', shuffle=True, + percent10=True) assert(any(dataset.target[-100:] == b'normal.')) diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index 0162676c50af7..f0c7aa1216e76 100644 --- 
a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -1,28 +1,17 @@ -"""Test Olivetti faces fetcher, if the data is available.""" -import pytest +"""Test Olivetti faces fetcher, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" + import numpy as np -from sklearn import datasets from sklearn.utils import Bunch from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_array_equal -def _is_olivetti_faces_not_available(): - try: - datasets.fetch_olivetti_faces(download_if_missing=False) - return False - except IOError: - return True - - -@pytest.mark.skipif( - _is_olivetti_faces_not_available(), - reason='Download Olivetti faces dataset to run this test' -) -def test_olivetti_faces(): - data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0) +def test_olivetti_faces(fetch_olivetti_faces_fxt): + data = fetch_olivetti_faces_fxt(shuffle=True, random_state=0) assert isinstance(data, Bunch) for expected_keys in ('data', 'images', 'target', 'DESCR'): @@ -34,4 +23,4 @@ def test_olivetti_faces(): assert_array_equal(np.unique(np.sort(data.target)), np.arange(40)) # test the return_X_y option - check_return_X_y(data, datasets.fetch_olivetti_faces) + check_return_X_y(data, fetch_olivetti_faces_fxt) diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index 7cae454bf158b..2c21201dce40e 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -1,26 +1,17 @@ -"""Test the rcv1 loader. +"""Test the rcv1 loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" -Skipped if rcv1 is not already downloaded to data_home. 
-""" - -import errno import scipy.sparse as sp import numpy as np from functools import partial -from sklearn.datasets import fetch_rcv1 from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import SkipTest - -def test_fetch_rcv1(): - try: - data1 = fetch_rcv1(shuffle=False, download_if_missing=False) - except IOError as e: - if e.errno == errno.ENOENT: - raise SkipTest("Download RCV1 dataset to run this test.") +def test_fetch_rcv1(fetch_rcv1_fxt): + data1 = fetch_rcv1_fxt(shuffle=False) X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id @@ -48,14 +39,12 @@ def test_fetch_rcv1(): assert num == Y1[:, j].data.size # test shuffling and subset - data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77, - download_if_missing=False) + data2 = fetch_rcv1_fxt(shuffle=True, subset='train', random_state=77) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option - fetch_func = partial(fetch_rcv1, shuffle=False, subset='train', - download_if_missing=False) + fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset='train') check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples