From 9f6648905fbdc387d01fa8c029c23616798cedd2 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 31 Jan 2020 16:56:05 +0100 Subject: [PATCH 01/15] download and test rcv1 in cron job --- .travis.yml | 2 +- build_tools/travis/test_script.sh | 7 +++---- sklearn/datasets/tests/test_rcv1.py | 8 +++++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9fda90f71a7c0..03d0d4b5b638b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -# make it explicit that we favor the new container-based travis workers + # make it explicit that we favor the new container-based travis workers language: python cache: diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index f13e0f1bbb2fa..a9c8fb73f9552 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -30,10 +30,9 @@ run_tests() { cp setup.cfg $TEST_DIR cd $TEST_DIR - # Skip tests that require large downloads over the network to save bandwidth - # usage as travis workers are stateless and therefore traditional local - # disk caching does not work. - export SKLEARN_SKIP_NETWORK_TESTS=1 + # Tests that require large downloads over the networks are skipped in CI. + # Here we make sure, that they are still run on a regular basis. + export SKLEARN_SKIP_NETWORK_TESTS=0 if [[ "$COVERAGE" == "true" ]]; then TEST_CMD="$TEST_CMD --cov sklearn" diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index 7cae454bf158b..3738f2ba4ff87 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -4,6 +4,7 @@ """ import errno +import os import scipy.sparse as sp import numpy as np from functools import partial @@ -15,8 +16,13 @@ def test_fetch_rcv1(): + # Do not download data, unless explicitly requested via environment var + download_if_missing = False + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: + download_if_missing = True try: - data1 = fetch_rcv1(shuffle=False, download_if_missing=False) + data1 = fetch_rcv1(shuffle=False, + download_if_missing=download_if_missing) except IOError as e: if e.errno == errno.ENOENT: raise SkipTest("Download RCV1 dataset to run this test.") From c9d26fd63bcb2a38bb0eeae95d0e3c51df60430d Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 31 Jan 2020 17:01:09 +0100 Subject: [PATCH 02/15] download and test 20news in cron job [scipy-dev] --- sklearn/datasets/tests/test_20news.py | 12 ++++++++++-- sklearn/datasets/tests/test_rcv1.py | 7 +++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 15cb49c44b0e5..72b5dad97d7fe 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -1,5 +1,8 @@ -"""Test the 20news downloader, if the data is available.""" +"""Test the 20news downloader, if the data is available, +or if specifically requested via environment variable +(e.g. 
for travis cron job).""" from functools import partial +import os import numpy as np import scipy.sparse as sp @@ -12,9 +15,14 @@ def test_20news(): + # Do not download data, unless explicitly requested via environment var + download_if_missing = False + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: + download_if_missing = True try: data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=False, shuffle=False) + subset='all', download_if_missing=download_if_missing, + shuffle=False) except IOError: raise SkipTest("Download 20 newsgroups to run this test") diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index 3738f2ba4ff87..f733a8879be23 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -1,7 +1,6 @@ -"""Test the rcv1 loader. - -Skipped if rcv1 is not already downloaded to data_home. -""" +"""Test the rcv1 loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" import errno import os From 47072fb55a914728ce1005e7126020d50a7a496f Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 31 Jan 2020 17:05:02 +0100 Subject: [PATCH 03/15] fix typo [scipy-dev] --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 03d0d4b5b638b..9fda90f71a7c0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ - # make it explicit that we favor the new container-based travis workers +# make it explicit that we favor the new container-based travis workers language: python cache: From dff956aff9c091c425ea57cf9ef8757bf5796e06 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 31 Jan 2020 17:30:01 +0100 Subject: [PATCH 04/15] california_housing, covtype, kddcup99, olivetti_faces [scipy-dev] --- sklearn/datasets/tests/test_california_housing.py | 14 +++++++++----- sklearn/datasets/tests/test_covtype.py | 15 ++++++++++----- sklearn/datasets/tests/test_kddcup99.py | 15 ++++++++++----- sklearn/datasets/tests/test_olivetti_faces.py | 12 ++++++++++-- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 56cd62ef8bc35..1b7d06922608f 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -1,8 +1,8 @@ -"""Test the california_housing loader. - -Skipped if california_housing is not already downloaded to data_home. -""" +"""Test the california_housing loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" +import os import pytest from sklearn.datasets import fetch_california_housing @@ -15,8 +15,12 @@ def fetch(*args, **kwargs): def _is_california_housing_dataset_not_available(): + # Do not download data, unless explicitly requested via environment var + download_if_missing = False + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: + download_if_missing = True try: - fetch_california_housing(download_if_missing=False) + fetch_california_housing(download_if_missing=download_if_missing) return False except IOError: return True diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 1127b8114c5e7..efb195d38fe66 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,8 +1,8 @@ -"""Test the covtype loader. 
- -Skipped if covtype is not already downloaded to data_home. -""" +"""Test the covtype loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job).""" +import os from sklearn.datasets import fetch_covtype from sklearn.utils._testing import SkipTest from sklearn.datasets.tests.test_common import check_return_X_y @@ -10,7 +10,12 @@ def fetch(*args, **kwargs): - return fetch_covtype(*args, download_if_missing=False, **kwargs) + # Do not download data, unless explicitly requested via environment var + download_if_missing = False + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: + download_if_missing = True + return fetch_covtype(*args, download_if_missing=download_if_missing, + **kwargs) def test_fetch(): diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 6d371e5a8e6f0..d367eac4eadf5 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -1,18 +1,23 @@ -"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data -is too big to use in unit-testing. +"""Test kddcup99 loader, if the data is available, +or if specifically requested via environment variable +(e.g. for travis cron job). -The test is skipped if the data wasn't previously fetched and saved to -scikit-learn data folder. +Only 'percent10' mode is tested, as the full data +is too big to use in unit-testing. """ +import os from sklearn.datasets import fetch_kddcup99 from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import SkipTest from functools import partial - def test_percent10(): + # Do not download data, unless explicitly requested via environment var + download_if_missing = False + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: + download_if_missing = True try: data = fetch_kddcup99(download_if_missing=False) except IOError: diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index 0162676c50af7..5e91df96128c6 100644 --- a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -1,4 +1,8 @@ -"""Test Olivetti faces fetcher, if the data is available.""" +"""Test Olivetti faces fetcher, if the data is available, +or if specifically requested via environment variable +(e.g. 
for travis cron job).""" + +import os import pytest import numpy as np @@ -10,8 +14,12 @@ def _is_olivetti_faces_not_available(): + # Do not download data, unless explicitly requested via environment var + download_if_missing = False + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: + download_if_missing = True try: - datasets.fetch_olivetti_faces(download_if_missing=False) + datasets.fetch_olivetti_faces(download_if_missing=download_if_missing) return False except IOError: return True From a1ad4df0fa670b7b2577bd4802780e27a9575e68 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 31 Jan 2020 17:36:21 +0100 Subject: [PATCH 05/15] fix kddcup99 [scipy-dev] --- sklearn/datasets/tests/test_kddcup99.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index d367eac4eadf5..01931ba27ed84 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -19,7 +19,7 @@ def test_percent10(): if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: download_if_missing = True try: - data = fetch_kddcup99(download_if_missing=False) + data = fetch_kddcup99(download_if_missing=download_if_missing) except IOError: raise SkipTest("kddcup99 dataset can not be loaded.") From 99c02f2ef80e5d6c8f240a8df68aac76ea1e2be1 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 14 Feb 2020 13:01:26 +0100 Subject: [PATCH 06/15] fetch datasets in wrapper func [scipy-dev] --- sklearn/datasets/tests/test_20news.py | 69 +++++++++---------- .../datasets/tests/test_california_housing.py | 10 ++- sklearn/datasets/tests/test_covtype.py | 23 +++---- sklearn/datasets/tests/test_kddcup99.py | 43 ++++++------ sklearn/datasets/tests/test_olivetti_faces.py | 6 +- sklearn/datasets/tests/test_rcv1.py | 30 ++++---- 6 files changed, 88 insertions(+), 93 deletions(-) diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 72b5dad97d7fe..76453d7fbc636 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -2,32 +2,40 @@ or if specifically requested via environment variable (e.g. 
for travis cron job).""" from functools import partial -import os +from os import environ +import pytest import numpy as np import scipy.sparse as sp -from sklearn.utils._testing import SkipTest, assert_allclose_dense_sparse +from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.datasets.tests.test_common import check_return_X_y from sklearn import datasets from sklearn.preprocessing import normalize -def test_20news(): +def _fetch_20newsgroups(vectorized=False, *args, **kwargs): # Do not download data, unless explicitly requested via environment var - download_if_missing = False - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: - download_if_missing = True + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' try: - data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=download_if_missing, - shuffle=False) + if vectorized: + return datasets.fetch_20newsgroups_vectorized( + *args, download_if_missing=download_if_missing, **kwargs) + else: + return datasets.fetch_20newsgroups( + *args, download_if_missing=download_if_missing, **kwargs) except IOError: - raise SkipTest("Download 20 newsgroups to run this test") + return None + + +@pytest.mark.skipif(_fetch_20newsgroups() is None, + reason="Download 20 newsgroups to run this test") +def test_20news(): + data = _fetch_20newsgroups(subset='all', shuffle=False) # Extract a reduced dataset - data2cats = datasets.fetch_20newsgroups( + data2cats = _fetch_20newsgroups( subset='all', categories=data.target_names[-1:-3:-1], shuffle=False) # Check that the ordering of the target_names is the same # as the ordering in the full dataset @@ -48,72 +56,61 @@ def test_20news(): assert entry1 == entry2 # check that return_X_y option - X, y = datasets.fetch_20newsgroups( + X, y = _fetch_20newsgroups( subset='all', shuffle=False, return_X_y=True ) assert len(X) == len(data.data) assert y.shape == data.target.shape +@pytest.mark.skipif(_fetch_20newsgroups() is None, + reason="Download 20 newsgroups to run this test") def test_20news_length_consistency(): """Checks the length consistencies within the bunch This is a non-regression test for a bug present in 0.16.1. 
""" - try: - data = datasets.fetch_20newsgroups( - subset='all', download_if_missing=False, shuffle=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") # Extract the full dataset - data = datasets.fetch_20newsgroups(subset='all') + data = _fetch_20newsgroups(subset='all') assert len(data['data']) == len(data.data) assert len(data['target']) == len(data.target) assert len(data['filenames']) == len(data.filenames) +@pytest.mark.skipif(_fetch_20newsgroups(vectorized=True) is None, + reason="Download 20 news vectorized to run this test") def test_20news_vectorized(): - try: - datasets.fetch_20newsgroups(subset='all', - download_if_missing=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") - # test subset = train - bunch = datasets.fetch_20newsgroups_vectorized(subset="train") + bunch = _fetch_20newsgroups(vectorized=True, subset="train") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314, 130107) assert bunch.target.shape[0] == 11314 assert bunch.data.dtype == np.float64 # test subset = test - bunch = datasets.fetch_20newsgroups_vectorized(subset="test") + bunch = _fetch_20newsgroups(vectorized=True, subset="test") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (7532, 130107) assert bunch.target.shape[0] == 7532 assert bunch.data.dtype == np.float64 # test return_X_y option - fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test') + fetch_func = partial(_fetch_20newsgroups, vectorized=True, subset='test') check_return_X_y(bunch, fetch_func) # test subset = all - bunch = datasets.fetch_20newsgroups_vectorized(subset='all') + bunch = _fetch_20newsgroups(vectorized=True, subset='all') assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314 + 7532, 130107) assert bunch.target.shape[0] == 11314 + 7532 assert bunch.data.dtype == np.float64 +@pytest.mark.skipif(_fetch_20newsgroups(vectorized=True) is None, + reason="Download 20 news vectorized to run this test") def test_20news_normalization(): - try: - X = datasets.fetch_20newsgroups_vectorized(normalize=False, - download_if_missing=False) - X_ = datasets.fetch_20newsgroups_vectorized(normalize=True, - download_if_missing=False) - except IOError: - raise SkipTest("Download 20 newsgroups to run this test") - + X = _fetch_20newsgroups(vectorized=True, normalize=False) + X_ = _fetch_20newsgroups(vectorized=True, normalize=True) X_norm = X_['data'][:100] X = X['data'][:100] diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 1b7d06922608f..482492316cf68 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -2,7 +2,7 @@ or if specifically requested via environment variable (e.g. 
for travis cron job).""" -import os +from os import environ import pytest from sklearn.datasets import fetch_california_housing @@ -11,16 +11,14 @@ def fetch(*args, **kwargs): - return fetch_california_housing(*args, download_if_missing=False, **kwargs) + return fetch_california_housing(*args, **kwargs) def _is_california_housing_dataset_not_available(): # Do not download data, unless explicitly requested via environment var - download_if_missing = False - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: - download_if_missing = True + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' try: - fetch_california_housing(download_if_missing=download_if_missing) + fetch(download_if_missing=download_if_missing) return False except IOError: return True diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index efb195d38fe66..386445807caac 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -2,28 +2,27 @@ or if specifically requested via environment variable (e.g. for travis cron job).""" -import os +from os import environ +import pytest from sklearn.datasets import fetch_covtype -from sklearn.utils._testing import SkipTest from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial def fetch(*args, **kwargs): # Do not download data, unless explicitly requested via environment var - download_if_missing = False - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: - download_if_missing = True - return fetch_covtype(*args, download_if_missing=download_if_missing, - **kwargs) - - -def test_fetch(): + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' try: - data1 = fetch(shuffle=True, random_state=42) + return fetch_covtype(*args, download_if_missing=download_if_missing, + **kwargs) except IOError: - raise SkipTest("Covertype dataset can not be loaded.") + return None + +@pytest.mark.skipif(fetch() is None, + reason="Download covtype to run this test") +def test_fetch(): + data1 = fetch(shuffle=True, random_state=42) data2 = fetch(shuffle=True, random_state=37) X1, X2 = data1['data'], data2['data'] diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 01931ba27ed84..74668413efbf3 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -6,55 +6,58 @@ is too big to use in unit-testing. 
""" -import os +from os import environ +import pytest from sklearn.datasets import fetch_kddcup99 from sklearn.datasets.tests.test_common import check_return_X_y -from sklearn.utils._testing import SkipTest from functools import partial -def test_percent10(): +def _fetch_dataset(*args, **kwargs): # Do not download data, unless explicitly requested via environment var - download_if_missing = False - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: - download_if_missing = True + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' try: - data = fetch_kddcup99(download_if_missing=download_if_missing) + return fetch_kddcup99(*args, download_if_missing=download_if_missing, + **kwargs) except IOError: - raise SkipTest("kddcup99 dataset can not be loaded.") + return None + + +@pytest.mark.skipif(_fetch_dataset() is None, + reason="Download kddcup99 to run this test") +def test_percent10(): + data = _fetch_dataset() assert data.data.shape == (494021, 41) assert data.target.shape == (494021,) - data_shuffled = fetch_kddcup99(shuffle=True, random_state=0) + data_shuffled = _fetch_dataset(shuffle=True, random_state=0) assert data.data.shape == data_shuffled.data.shape assert data.target.shape == data_shuffled.target.shape - data = fetch_kddcup99('SA') + data = _fetch_dataset('SA') assert data.data.shape == (100655, 41) assert data.target.shape == (100655,) - data = fetch_kddcup99('SF') + data = _fetch_dataset('SF') assert data.data.shape == (73237, 4) assert data.target.shape == (73237,) - data = fetch_kddcup99('http') + data = _fetch_dataset('http') assert data.data.shape == (58725, 3) assert data.target.shape == (58725,) - data = fetch_kddcup99('smtp') + data = _fetch_dataset('smtp') assert data.data.shape == (9571, 3) assert data.target.shape == (9571,) - fetch_func = partial(fetch_kddcup99, 'smtp') + fetch_func = partial(_fetch_dataset, 'smtp') check_return_X_y(data, fetch_func) +@pytest.mark.skipif(_fetch_dataset() is None, + reason="Download kddcup99 to run this test") def test_shuffle(): - try: - dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True, - percent10=True, download_if_missing=False) - except IOError: - raise SkipTest("kddcup99 dataset can not be loaded.") - + dataset = _fetch_dataset(random_state=0, subset='SA', shuffle=True, + percent10=True) assert(any(dataset.target[-100:] == b'normal.')) diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index 5e91df96128c6..2d7e53815eef1 100644 --- a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -2,7 +2,7 @@ or if specifically requested via environment variable (e.g. for travis cron job).""" -import os +from os import environ import pytest import numpy as np @@ -15,9 +15,7 @@ def _is_olivetti_faces_not_available(): # Do not download data, unless explicitly requested via environment var - download_if_missing = False - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: - download_if_missing = True + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' try: datasets.fetch_olivetti_faces(download_if_missing=download_if_missing) return False diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index f733a8879be23..ad060dc22e4ae 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -2,8 +2,8 @@ or if specifically requested via environment variable (e.g. 
for travis cron job).""" -import errno -import os +from os import environ +import pytest import scipy.sparse as sp import numpy as np from functools import partial @@ -11,21 +11,22 @@ from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import SkipTest -def test_fetch_rcv1(): +def _fetch_data(*args, **kwargs): # Do not download data, unless explicitly requested via environment var - download_if_missing = False - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 1)) == 0: - download_if_missing = True + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' try: - data1 = fetch_rcv1(shuffle=False, - download_if_missing=download_if_missing) - except IOError as e: - if e.errno == errno.ENOENT: - raise SkipTest("Download RCV1 dataset to run this test.") + return fetch_rcv1(*args, download_if_missing=download_if_missing, + **kwargs) + except IOError: + return None + +@pytest.mark.skipif(_fetch_data() is None, + reason="Download RCV1 to run this test") +def test_fetch_rcv1(): + data1 = _fetch_data(shuffle=False) X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id @@ -53,13 +54,12 @@ def test_fetch_rcv1(): assert num == Y1[:, j].data.size # test shuffling and subset - data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77, - download_if_missing=False) + data2 = _fetch_data(shuffle=True, subset='train', random_state=77) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option - fetch_func = partial(fetch_rcv1, shuffle=False, subset='train', + fetch_func = partial(_fetch_data, shuffle=False, subset='train', download_if_missing=False) check_return_X_y(data2, fetch_func) From 4d3499c55d140d2b55ef9208f8b9ab39acc632ef Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 14 Feb 2020 13:31:45 +0100 Subject: [PATCH 07/15] fix rcv1 test [scipy-dev] --- sklearn/datasets/tests/test_rcv1.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index ad060dc22e4ae..bb93051029192 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -59,8 +59,7 @@ def test_fetch_rcv1(): s2 = data2.sample_id # test return_X_y option - fetch_func = partial(_fetch_data, shuffle=False, subset='train', - download_if_missing=False) + fetch_func = partial(_fetch_data, shuffle=False, subset='train') check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples From ec4db968dc9a05afd7dcc78f55bd37af9010bf54 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 14 Feb 2020 16:37:35 +0100 Subject: [PATCH 08/15] do not skip test_pandas_dependency_message [scipy-dev] --- .../datasets/tests/test_california_housing.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 482492316cf68..614c00c54b831 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -2,6 +2,7 @@ or if specifically requested via environment variable (e.g. 
for travis cron job).""" +import builtins from os import environ import pytest @@ -52,18 +53,28 @@ def test_fetch_asframe(): assert isinstance(bunch.target, pd.DataFrame) +@pytest.fixture +def hide_available_pandas(monkeypatch): + """ Pretend pandas was not installed. """ + import_orig = builtins.__import__ + + def mocked_import(name, *args, **kwargs): + if name == 'pandas': + raise ImportError() + return import_orig(name, *args, **kwargs) + + monkeypatch.setattr(builtins, '__import__', mocked_import) + + @pytest.mark.skipif( _is_california_housing_dataset_not_available(), reason='Download California Housing dataset to run this test' ) +@pytest.mark.usefixtures('hide_available_pandas') def test_pandas_dependency_message(): - try: - import pandas # noqa - pytest.skip("This test requires pandas to be not installed") - except ImportError: - # Check that pandas is imported lazily and that an informative error - # message is raised when pandas is missing: - expected_msg = ('fetch_california_housing with as_frame=True' - ' requires pandas') - with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing(as_frame=True) + # Check that pandas is imported lazily and that an informative error + # message is raised when pandas is missing: + expected_msg = ('fetch_california_housing with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_california_housing(as_frame=True) From e7081fa1b1db759589d08cdd297d033d676eed28 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Mon, 17 Feb 2020 14:43:33 +0100 Subject: [PATCH 09/15] introduce fetch fixtures [scipy-dev] --- sklearn/datasets/tests/conftest.py | 78 +++++++++++++++++++ sklearn/datasets/tests/test_20news.py | 56 ++++--------- sklearn/datasets/tests/test_base.py | 1 + .../datasets/tests/test_california_housing.py | 55 ++----------- sklearn/datasets/tests/test_covtype.py | 23 +----- sklearn/datasets/tests/test_kddcup99.py | 37 +++------ sklearn/datasets/tests/test_olivetti_faces.py | 23 +----- sklearn/datasets/tests/test_rcv1.py | 23 +----- 8 files changed, 120 insertions(+), 176 deletions(-) create mode 100644 sklearn/datasets/tests/conftest.py diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py new file mode 100644 index 0000000000000..fddc22dd80c8f --- /dev/null +++ b/sklearn/datasets/tests/conftest.py @@ -0,0 +1,78 @@ +""" Network tests are only run, if data is already locally available, +or if download is specifically requested by environment variable.""" +import builtins +from os import environ +import pytest +from sklearn.datasets import ( + fetch_20newsgroups as _fetch_20newsgroups, + fetch_20newsgroups_vectorized as _fetch_20newsgroups_vectorized, + fetch_california_housing as _fetch_california_housing, + fetch_covtype as _fetch_covtype, + fetch_kddcup99 as _fetch_kddcup99, + fetch_olivetti_faces as _fetch_olivetti_faces, + fetch_rcv1 as _fetch_rcv1, +) + + +def _wrapped_fetch(f, dataset_name): + """ Fetch dataset (download if missing and requested by environment) """ + download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' + + def wrapped(*args, **kwargs): + kwargs['download_if_missing'] = download_if_missing + try: + return f(*args, **kwargs) + except IOError: + pytest.skip("Download {} to run this test".format(dataset_name)) + + return wrapped + + +@pytest.fixture +def fetch_20newsgroups(): + return _wrapped_fetch(_fetch_20newsgroups, dataset_name='20newsgroups') + + +@pytest.fixture +def fetch_20newsgroups_vectorized(): + 
return _wrapped_fetch(_fetch_20newsgroups_vectorized, + dataset_name='20newsgroups_vectorized') + + +@pytest.fixture +def fetch_california_housing(): + return _wrapped_fetch(_fetch_california_housing, + dataset_name='california_housing') + + +@pytest.fixture +def fetch_covtype(): + return _wrapped_fetch(_fetch_covtype, dataset_name='covtype') + + +@pytest.fixture +def fetch_kddcup99(): + return _wrapped_fetch(_fetch_kddcup99, dataset_name='kddcup99') + + +@pytest.fixture +def fetch_olivetti_faces(): + return _wrapped_fetch(_fetch_olivetti_faces, dataset_name='olivetti_faces') + + +@pytest.fixture +def fetch_rcv1(): + return _wrapped_fetch(_fetch_rcv1, dataset_name='rcv1') + + +@pytest.fixture +def hide_available_pandas(monkeypatch): + """ Pretend pandas was not installed. """ + import_orig = builtins.__import__ + + def mocked_import(name, *args, **kwargs): + if name == 'pandas': + raise ImportError() + return import_orig(name, *args, **kwargs) + + monkeypatch.setattr(builtins, '__import__', mocked_import) diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 76453d7fbc636..d56d05579e311 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -2,40 +2,20 @@ or if specifically requested via environment variable (e.g. for travis cron job).""" from functools import partial -from os import environ -import pytest import numpy as np import scipy.sparse as sp from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.datasets.tests.test_common import check_return_X_y - -from sklearn import datasets from sklearn.preprocessing import normalize -def _fetch_20newsgroups(vectorized=False, *args, **kwargs): - # Do not download data, unless explicitly requested via environment var - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - try: - if vectorized: - return datasets.fetch_20newsgroups_vectorized( - *args, download_if_missing=download_if_missing, **kwargs) - else: - return datasets.fetch_20newsgroups( - *args, download_if_missing=download_if_missing, **kwargs) - except IOError: - return None - - -@pytest.mark.skipif(_fetch_20newsgroups() is None, - reason="Download 20 newsgroups to run this test") -def test_20news(): - data = _fetch_20newsgroups(subset='all', shuffle=False) +def test_20news(fetch_20newsgroups): + data = fetch_20newsgroups(subset='all', shuffle=False) # Extract a reduced dataset - data2cats = _fetch_20newsgroups( + data2cats = fetch_20newsgroups( subset='all', categories=data.target_names[-1:-3:-1], shuffle=False) # Check that the ordering of the target_names is the same # as the ordering in the full dataset @@ -56,61 +36,53 @@ def test_20news(): assert entry1 == entry2 # check that return_X_y option - X, y = _fetch_20newsgroups( - subset='all', shuffle=False, return_X_y=True - ) + X, y = fetch_20newsgroups(subset='all', shuffle=False, return_X_y=True) assert len(X) == len(data.data) assert y.shape == data.target.shape -@pytest.mark.skipif(_fetch_20newsgroups() is None, - reason="Download 20 newsgroups to run this test") -def test_20news_length_consistency(): +def test_20news_length_consistency(fetch_20newsgroups): """Checks the length consistencies within the bunch This is a non-regression test for a bug present in 0.16.1. 
""" # Extract the full dataset - data = _fetch_20newsgroups(subset='all') + data = fetch_20newsgroups(subset='all') assert len(data['data']) == len(data.data) assert len(data['target']) == len(data.target) assert len(data['filenames']) == len(data.filenames) -@pytest.mark.skipif(_fetch_20newsgroups(vectorized=True) is None, - reason="Download 20 news vectorized to run this test") -def test_20news_vectorized(): +def test_20news_vectorized(fetch_20newsgroups_vectorized): # test subset = train - bunch = _fetch_20newsgroups(vectorized=True, subset="train") + bunch = fetch_20newsgroups_vectorized(subset="train") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314, 130107) assert bunch.target.shape[0] == 11314 assert bunch.data.dtype == np.float64 # test subset = test - bunch = _fetch_20newsgroups(vectorized=True, subset="test") + bunch = fetch_20newsgroups_vectorized(subset="test") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (7532, 130107) assert bunch.target.shape[0] == 7532 assert bunch.data.dtype == np.float64 # test return_X_y option - fetch_func = partial(_fetch_20newsgroups, vectorized=True, subset='test') + fetch_func = partial(fetch_20newsgroups_vectorized, subset='test') check_return_X_y(bunch, fetch_func) # test subset = all - bunch = _fetch_20newsgroups(vectorized=True, subset='all') + bunch = fetch_20newsgroups_vectorized(subset='all') assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314 + 7532, 130107) assert bunch.target.shape[0] == 11314 + 7532 assert bunch.data.dtype == np.float64 -@pytest.mark.skipif(_fetch_20newsgroups(vectorized=True) is None, - reason="Download 20 news vectorized to run this test") -def test_20news_normalization(): - X = _fetch_20newsgroups(vectorized=True, normalize=False) - X_ = _fetch_20newsgroups(vectorized=True, normalize=True) +def test_20news_normalization(fetch_20newsgroups_vectorized): + X = fetch_20newsgroups_vectorized(normalize=False) + X_ = fetch_20newsgroups_vectorized(normalize=True) X_norm = X_['data'][:100] X = X['data'][:100] diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index a58bdc9ed644d..f3e7769d0d0f3 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -257,6 +257,7 @@ def test_toy_dataset_as_frame(loader_func, data_dtype, target_dtype): load_linnerud, load_wine, ]) +@pytest.mark.usefixtures('hide_available_pandas') def test_toy_dataset_as_frame_no_pandas(loader_func): check_pandas_dependency_message(loader_func) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 5400cab210070..ced4a20fc902d 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -1,51 +1,25 @@ """Test the california_housing loader, if the data is available, or if specifically requested via environment variable (e.g. 
for travis cron job).""" - -import builtins -from os import environ import pytest -from sklearn.datasets import fetch_california_housing from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def fetch(*args, **kwargs): - return fetch_california_housing(*args, **kwargs) - - -def _is_california_housing_dataset_not_available(): - # Do not download data, unless explicitly requested via environment var - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - try: - fetch(download_if_missing=download_if_missing) - return False - except IOError: - return True - - -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_fetch(): - data = fetch() +def test_fetch(fetch_california_housing): + data = fetch_california_housing() assert((20640, 8) == data.data.shape) assert((20640, ) == data.target.shape) # test return_X_y option - fetch_func = partial(fetch) + fetch_func = partial(fetch_california_housing) check_return_X_y(data, fetch_func) -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) -def test_fetch_asframe(): +def test_fetch_asframe(fetch_california_housing): pd = pytest.importorskip('pandas') - bunch = fetch(as_frame=True) + bunch = fetch_california_housing(as_frame=True) frame = bunch.frame assert hasattr(bunch, 'frame') is True assert frame.shape == (20640, 9) @@ -53,25 +27,8 @@ def test_fetch_asframe(): assert isinstance(bunch.target, pd.Series) -@pytest.fixture -def hide_available_pandas(monkeypatch): - """ Pretend pandas was not installed. """ - import_orig = builtins.__import__ - - def mocked_import(name, *args, **kwargs): - if name == 'pandas': - raise ImportError() - return import_orig(name, *args, **kwargs) - - monkeypatch.setattr(builtins, '__import__', mocked_import) - - -@pytest.mark.skipif( - _is_california_housing_dataset_not_available(), - reason='Download California Housing dataset to run this test' -) @pytest.mark.usefixtures('hide_available_pandas') -def test_pandas_dependency_message(): +def test_pandas_dependency_message(fetch_california_housing): # Check that pandas is imported lazily and that an informative error # message is raised when pandas is missing: expected_msg = ('fetch_california_housing with as_frame=True' diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 386445807caac..1b127b0100f1f 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -2,28 +2,13 @@ or if specifically requested via environment variable (e.g. 
for travis cron job).""" -from os import environ -import pytest -from sklearn.datasets import fetch_covtype from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def fetch(*args, **kwargs): - # Do not download data, unless explicitly requested via environment var - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - try: - return fetch_covtype(*args, download_if_missing=download_if_missing, - **kwargs) - except IOError: - return None - - -@pytest.mark.skipif(fetch() is None, - reason="Download covtype to run this test") -def test_fetch(): - data1 = fetch(shuffle=True, random_state=42) - data2 = fetch(shuffle=True, random_state=37) +def test_fetch(fetch_covtype): + data1 = fetch_covtype(shuffle=True, random_state=42) + data2 = fetch_covtype(shuffle=True, random_state=37) X1, X2 = data1['data'], data2['data'] assert (581012, 54) == X1.shape @@ -36,5 +21,5 @@ def test_fetch(): assert (X1.shape[0],) == y2.shape # test return_X_y option - fetch_func = partial(fetch) + fetch_func = partial(fetch_covtype) check_return_X_y(data1, fetch_func) diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 74668413efbf3..f0827290ce2e8 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -6,58 +6,41 @@ is too big to use in unit-testing. """ -from os import environ -import pytest -from sklearn.datasets import fetch_kddcup99 from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial -def _fetch_dataset(*args, **kwargs): - # Do not download data, unless explicitly requested via environment var - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - try: - return fetch_kddcup99(*args, download_if_missing=download_if_missing, - **kwargs) - except IOError: - return None - - -@pytest.mark.skipif(_fetch_dataset() is None, - reason="Download kddcup99 to run this test") -def test_percent10(): - data = _fetch_dataset() +def test_percent10(fetch_kddcup99): + data = fetch_kddcup99() assert data.data.shape == (494021, 41) assert data.target.shape == (494021,) - data_shuffled = _fetch_dataset(shuffle=True, random_state=0) + data_shuffled = fetch_kddcup99(shuffle=True, random_state=0) assert data.data.shape == data_shuffled.data.shape assert data.target.shape == data_shuffled.target.shape - data = _fetch_dataset('SA') + data = fetch_kddcup99('SA') assert data.data.shape == (100655, 41) assert data.target.shape == (100655,) - data = _fetch_dataset('SF') + data = fetch_kddcup99('SF') assert data.data.shape == (73237, 4) assert data.target.shape == (73237,) - data = _fetch_dataset('http') + data = fetch_kddcup99('http') assert data.data.shape == (58725, 3) assert data.target.shape == (58725,) - data = _fetch_dataset('smtp') + data = fetch_kddcup99('smtp') assert data.data.shape == (9571, 3) assert data.target.shape == (9571,) - fetch_func = partial(_fetch_dataset, 'smtp') + fetch_func = partial(fetch_kddcup99, 'smtp') check_return_X_y(data, fetch_func) -@pytest.mark.skipif(_fetch_dataset() is None, - reason="Download kddcup99 to run this test") -def test_shuffle(): - dataset = _fetch_dataset(random_state=0, subset='SA', shuffle=True, +def test_shuffle(fetch_kddcup99): + dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True, percent10=True) assert(any(dataset.target[-100:] == b'normal.')) diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index 
2d7e53815eef1..cb76b1f1e87a8 100644 --- a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -2,33 +2,16 @@ or if specifically requested via environment variable (e.g. for travis cron job).""" -from os import environ -import pytest import numpy as np -from sklearn import datasets from sklearn.utils import Bunch from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_array_equal -def _is_olivetti_faces_not_available(): - # Do not download data, unless explicitly requested via environment var - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - try: - datasets.fetch_olivetti_faces(download_if_missing=download_if_missing) - return False - except IOError: - return True - - -@pytest.mark.skipif( - _is_olivetti_faces_not_available(), - reason='Download Olivetti faces dataset to run this test' -) -def test_olivetti_faces(): - data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0) +def test_olivetti_faces(fetch_olivetti_faces): + data = fetch_olivetti_faces(shuffle=True, random_state=0) assert isinstance(data, Bunch) for expected_keys in ('data', 'images', 'target', 'DESCR'): @@ -40,4 +23,4 @@ def test_olivetti_faces(): assert_array_equal(np.unique(np.sort(data.target)), np.arange(40)) # test the return_X_y option - check_return_X_y(data, datasets.fetch_olivetti_faces) + check_return_X_y(data, fetch_olivetti_faces) diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index bb93051029192..f7ecb0e8c7199 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -2,31 +2,16 @@ or if specifically requested via environment variable (e.g. for travis cron job).""" -from os import environ -import pytest import scipy.sparse as sp import numpy as np from functools import partial -from sklearn.datasets import fetch_rcv1 from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal -def _fetch_data(*args, **kwargs): - # Do not download data, unless explicitly requested via environment var - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - try: - return fetch_rcv1(*args, download_if_missing=download_if_missing, - **kwargs) - except IOError: - return None - - -@pytest.mark.skipif(_fetch_data() is None, - reason="Download RCV1 to run this test") -def test_fetch_rcv1(): - data1 = _fetch_data(shuffle=False) +def test_fetch_rcv1(fetch_rcv1): + data1 = fetch_rcv1(shuffle=False) X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id @@ -54,12 +39,12 @@ def test_fetch_rcv1(): assert num == Y1[:, j].data.size # test shuffling and subset - data2 = _fetch_data(shuffle=True, subset='train', random_state=77) + data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option - fetch_func = partial(_fetch_data, shuffle=False, subset='train') + fetch_func = partial(fetch_rcv1, shuffle=False, subset='train') check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples From b1df646681803ef33bb6633d84e4031e9418db17 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Tue, 25 Feb 2020 19:35:20 +0100 Subject: [PATCH 10/15] Remove pandas hiding fixture [scipy-dev] --- sklearn/datasets/tests/conftest.py | 13 ------------- 
sklearn/datasets/tests/test_base.py | 1 - .../datasets/tests/test_california_housing.py | 17 ++++++++++------- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index fddc22dd80c8f..6c23ecf5a9a96 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -63,16 +63,3 @@ def fetch_olivetti_faces(): @pytest.fixture def fetch_rcv1(): return _wrapped_fetch(_fetch_rcv1, dataset_name='rcv1') - - -@pytest.fixture -def hide_available_pandas(monkeypatch): - """ Pretend pandas was not installed. """ - import_orig = builtins.__import__ - - def mocked_import(name, *args, **kwargs): - if name == 'pandas': - raise ImportError() - return import_orig(name, *args, **kwargs) - - monkeypatch.setattr(builtins, '__import__', mocked_import) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index f3e7769d0d0f3..a58bdc9ed644d 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -257,7 +257,6 @@ def test_toy_dataset_as_frame(loader_func, data_dtype, target_dtype): load_linnerud, load_wine, ]) -@pytest.mark.usefixtures('hide_available_pandas') def test_toy_dataset_as_frame_no_pandas(loader_func): check_pandas_dependency_message(loader_func) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index ced4a20fc902d..b5d2d7ce9d688 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -27,11 +27,14 @@ def test_fetch_asframe(fetch_california_housing): assert isinstance(bunch.target, pd.Series) -@pytest.mark.usefixtures('hide_available_pandas') def test_pandas_dependency_message(fetch_california_housing): - # Check that pandas is imported lazily and that an informative error - # message is raised when pandas is missing: - expected_msg = ('fetch_california_housing with as_frame=True' - ' requires pandas') - with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing(as_frame=True) + try: + import pandas # noqa + pytest.skip("This test requires pandas to be not installed") + except ImportError: + # Check that pandas is imported lazily and that an informative error + # message is raised when pandas is missing: + expected_msg = ('fetch_california_housing with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_california_housing(as_frame=True) From 25667a533f02e4fc004daab82550a5e507462b6f Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Tue, 25 Feb 2020 19:42:53 +0100 Subject: [PATCH 11/15] fix flake8 issue [scipy-dev] --- sklearn/datasets/tests/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index 6c23ecf5a9a96..b5e8f494f4f2e 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -1,6 +1,5 @@ """ Network tests are only run, if data is already locally available, or if download is specifically requested by environment variable.""" -import builtins from os import environ import pytest from sklearn.datasets import ( From cfaedfef57bf5922a539d7236925d5f844cb2e19 Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Thu, 27 Feb 2020 13:34:31 +0100 Subject: [PATCH 12/15] fix name collision [scipy-dev] --- sklearn/datasets/tests/conftest.py | 14 +++++----- sklearn/datasets/tests/test_20news.py | 28 +++++++++---------- 
.../datasets/tests/test_california_housing.py | 14 +++++----- sklearn/datasets/tests/test_covtype.py | 8 +++--- sklearn/datasets/tests/test_kddcup99.py | 22 +++++++-------- sklearn/datasets/tests/test_olivetti_faces.py | 6 ++-- sklearn/datasets/tests/test_rcv1.py | 8 +++--- 7 files changed, 50 insertions(+), 50 deletions(-) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index b5e8f494f4f2e..9952fa05386a0 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -28,37 +28,37 @@ def wrapped(*args, **kwargs): @pytest.fixture -def fetch_20newsgroups(): +def fetch_20newsgroups_fxt(): return _wrapped_fetch(_fetch_20newsgroups, dataset_name='20newsgroups') @pytest.fixture -def fetch_20newsgroups_vectorized(): +def fetch_20newsgroups_vectorized_fxt(): return _wrapped_fetch(_fetch_20newsgroups_vectorized, dataset_name='20newsgroups_vectorized') @pytest.fixture -def fetch_california_housing(): +def fetch_california_housing_fxt(): return _wrapped_fetch(_fetch_california_housing, dataset_name='california_housing') @pytest.fixture -def fetch_covtype(): +def fetch_covtype_fxt(): return _wrapped_fetch(_fetch_covtype, dataset_name='covtype') @pytest.fixture -def fetch_kddcup99(): +def fetch_kddcup99_fxt(): return _wrapped_fetch(_fetch_kddcup99, dataset_name='kddcup99') @pytest.fixture -def fetch_olivetti_faces(): +def fetch_olivetti_faces_fxt(): return _wrapped_fetch(_fetch_olivetti_faces, dataset_name='olivetti_faces') @pytest.fixture -def fetch_rcv1(): +def fetch_rcv1_fxt(): return _wrapped_fetch(_fetch_rcv1, dataset_name='rcv1') diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index d56d05579e311..f800a49238ec1 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -11,11 +11,11 @@ from sklearn.preprocessing import normalize -def test_20news(fetch_20newsgroups): - data = fetch_20newsgroups(subset='all', shuffle=False) +def test_20news(fetch_20newsgroups_fxt): + data = fetch_20newsgroups_fxt(subset='all', shuffle=False) # Extract a reduced dataset - data2cats = fetch_20newsgroups( + data2cats = fetch_20newsgroups_fxt( subset='all', categories=data.target_names[-1:-3:-1], shuffle=False) # Check that the ordering of the target_names is the same # as the ordering in the full dataset @@ -36,53 +36,53 @@ def test_20news(fetch_20newsgroups): assert entry1 == entry2 # check that return_X_y option - X, y = fetch_20newsgroups(subset='all', shuffle=False, return_X_y=True) + X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True) assert len(X) == len(data.data) assert y.shape == data.target.shape -def test_20news_length_consistency(fetch_20newsgroups): +def test_20news_length_consistency(fetch_20newsgroups_fxt): """Checks the length consistencies within the bunch This is a non-regression test for a bug present in 0.16.1. 
""" # Extract the full dataset - data = fetch_20newsgroups(subset='all') + data = fetch_20newsgroups_fxt(subset='all') assert len(data['data']) == len(data.data) assert len(data['target']) == len(data.target) assert len(data['filenames']) == len(data.filenames) -def test_20news_vectorized(fetch_20newsgroups_vectorized): +def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt): # test subset = train - bunch = fetch_20newsgroups_vectorized(subset="train") + bunch = fetch_20newsgroups_vectorized_fxt(subset="train") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314, 130107) assert bunch.target.shape[0] == 11314 assert bunch.data.dtype == np.float64 # test subset = test - bunch = fetch_20newsgroups_vectorized(subset="test") + bunch = fetch_20newsgroups_vectorized_fxt(subset="test") assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (7532, 130107) assert bunch.target.shape[0] == 7532 assert bunch.data.dtype == np.float64 # test return_X_y option - fetch_func = partial(fetch_20newsgroups_vectorized, subset='test') + fetch_func = partial(fetch_20newsgroups_vectorized_fxt, subset='test') check_return_X_y(bunch, fetch_func) # test subset = all - bunch = fetch_20newsgroups_vectorized(subset='all') + bunch = fetch_20newsgroups_vectorized_fxt(subset='all') assert sp.isspmatrix_csr(bunch.data) assert bunch.data.shape == (11314 + 7532, 130107) assert bunch.target.shape[0] == 11314 + 7532 assert bunch.data.dtype == np.float64 -def test_20news_normalization(fetch_20newsgroups_vectorized): - X = fetch_20newsgroups_vectorized(normalize=False) - X_ = fetch_20newsgroups_vectorized(normalize=True) +def test_20news_normalization(fetch_20newsgroups_vectorized_fxt): + X = fetch_20newsgroups_vectorized_fxt(normalize=False) + X_ = fetch_20newsgroups_vectorized_fxt(normalize=True) X_norm = X_['data'][:100] X = X['data'][:100] diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index b5d2d7ce9d688..af1e1ff1370e1 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -7,19 +7,19 @@ from functools import partial -def test_fetch(fetch_california_housing): - data = fetch_california_housing() +def test_fetch(fetch_california_housing_fxt): + data = fetch_california_housing_fxt() assert((20640, 8) == data.data.shape) assert((20640, ) == data.target.shape) # test return_X_y option - fetch_func = partial(fetch_california_housing) + fetch_func = partial(fetch_california_housing_fxt) check_return_X_y(data, fetch_func) -def test_fetch_asframe(fetch_california_housing): +def test_fetch_asframe(fetch_california_housing_fxt): pd = pytest.importorskip('pandas') - bunch = fetch_california_housing(as_frame=True) + bunch = fetch_california_housing_fxt(as_frame=True) frame = bunch.frame assert hasattr(bunch, 'frame') is True assert frame.shape == (20640, 9) @@ -27,7 +27,7 @@ def test_fetch_asframe(fetch_california_housing): assert isinstance(bunch.target, pd.Series) -def test_pandas_dependency_message(fetch_california_housing): +def test_pandas_dependency_message(fetch_california_housing_fxt): try: import pandas # noqa pytest.skip("This test requires pandas to be not installed") @@ -37,4 +37,4 @@ def test_pandas_dependency_message(fetch_california_housing): expected_msg = ('fetch_california_housing with as_frame=True' ' requires pandas') with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing(as_frame=True) + 
fetch_california_housing_fxt(as_frame=True) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 1b127b0100f1f..d966e6c3890d0 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -6,9 +6,9 @@ from functools import partial -def test_fetch(fetch_covtype): - data1 = fetch_covtype(shuffle=True, random_state=42) - data2 = fetch_covtype(shuffle=True, random_state=37) +def test_fetch(fetch_covtype_fxt): + data1 = fetch_covtype_fxt(shuffle=True, random_state=42) + data2 = fetch_covtype_fxt(shuffle=True, random_state=37) X1, X2 = data1['data'], data2['data'] assert (581012, 54) == X1.shape @@ -21,5 +21,5 @@ def test_fetch(fetch_covtype): assert (X1.shape[0],) == y2.shape # test return_X_y option - fetch_func = partial(fetch_covtype) + fetch_func = partial(fetch_covtype_fxt) check_return_X_y(data1, fetch_func) diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index f0827290ce2e8..899abd2bcb153 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -10,37 +10,37 @@ from functools import partial -def test_percent10(fetch_kddcup99): - data = fetch_kddcup99() +def test_percent10(fetch_kddcup99_fxt): + data = fetch_kddcup99_fxt() assert data.data.shape == (494021, 41) assert data.target.shape == (494021,) - data_shuffled = fetch_kddcup99(shuffle=True, random_state=0) + data_shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0) assert data.data.shape == data_shuffled.data.shape assert data.target.shape == data_shuffled.target.shape - data = fetch_kddcup99('SA') + data = fetch_kddcup99_fxt('SA') assert data.data.shape == (100655, 41) assert data.target.shape == (100655,) - data = fetch_kddcup99('SF') + data = fetch_kddcup99_fxt('SF') assert data.data.shape == (73237, 4) assert data.target.shape == (73237,) - data = fetch_kddcup99('http') + data = fetch_kddcup99_fxt('http') assert data.data.shape == (58725, 3) assert data.target.shape == (58725,) - data = fetch_kddcup99('smtp') + data = fetch_kddcup99_fxt('smtp') assert data.data.shape == (9571, 3) assert data.target.shape == (9571,) - fetch_func = partial(fetch_kddcup99, 'smtp') + fetch_func = partial(fetch_kddcup99_fxt, 'smtp') check_return_X_y(data, fetch_func) -def test_shuffle(fetch_kddcup99): - dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True, - percent10=True) +def test_shuffle(fetch_kddcup99_fxt): + dataset = fetch_kddcup99_fxt(random_state=0, subset='SA', shuffle=True, + percent10=True) assert(any(dataset.target[-100:] == b'normal.')) diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index cb76b1f1e87a8..f0c7aa1216e76 100644 --- a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -10,8 +10,8 @@ from sklearn.utils._testing import assert_array_equal -def test_olivetti_faces(fetch_olivetti_faces): - data = fetch_olivetti_faces(shuffle=True, random_state=0) +def test_olivetti_faces(fetch_olivetti_faces_fxt): + data = fetch_olivetti_faces_fxt(shuffle=True, random_state=0) assert isinstance(data, Bunch) for expected_keys in ('data', 'images', 'target', 'DESCR'): @@ -23,4 +23,4 @@ def test_olivetti_faces(fetch_olivetti_faces): assert_array_equal(np.unique(np.sort(data.target)), np.arange(40)) # test the return_X_y option - check_return_X_y(data, fetch_olivetti_faces) + check_return_X_y(data, fetch_olivetti_faces_fxt) diff --git 
a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index f7ecb0e8c7199..2c21201dce40e 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -10,8 +10,8 @@ from sklearn.utils._testing import assert_array_equal -def test_fetch_rcv1(fetch_rcv1): - data1 = fetch_rcv1(shuffle=False) +def test_fetch_rcv1(fetch_rcv1_fxt): + data1 = fetch_rcv1_fxt(shuffle=False) X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id @@ -39,12 +39,12 @@ def test_fetch_rcv1(fetch_rcv1): assert num == Y1[:, j].data.size # test shuffling and subset - data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77) + data2 = fetch_rcv1_fxt(shuffle=True, subset='train', random_state=77) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option - fetch_func = partial(fetch_rcv1, shuffle=False, subset='train') + fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset='train') check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples From dce862c8390e9cf555e4f51000705c6c5e05109a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 1 Mar 2020 11:31:07 -0500 Subject: [PATCH 13/15] BLD [scipy-dev] From df87928feaf81ebf02c00be937806e2c764c51db Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 1 Mar 2020 21:38:06 -0500 Subject: [PATCH 14/15] STY Removes prefix from fetch_* --- sklearn/datasets/tests/conftest.py | 31 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index 9952fa05386a0..85242d7335685 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -2,15 +2,13 @@ or if download is specifically requested by environment variable.""" from os import environ import pytest -from sklearn.datasets import ( - fetch_20newsgroups as _fetch_20newsgroups, - fetch_20newsgroups_vectorized as _fetch_20newsgroups_vectorized, - fetch_california_housing as _fetch_california_housing, - fetch_covtype as _fetch_covtype, - fetch_kddcup99 as _fetch_kddcup99, - fetch_olivetti_faces as _fetch_olivetti_faces, - fetch_rcv1 as _fetch_rcv1, -) +from sklearn.datasets import fetch_20newsgroups +from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.datasets import fetch_california_housing +from sklearn.datasets import fetch_covtype +from sklearn.datasets import fetch_kddcup99 +from sklearn.datasets import fetch_olivetti_faces +from sklearn.datasets import fetch_rcv1 def _wrapped_fetch(f, dataset_name): @@ -23,42 +21,41 @@ def wrapped(*args, **kwargs): return f(*args, **kwargs) except IOError: pytest.skip("Download {} to run this test".format(dataset_name)) - return wrapped @pytest.fixture def fetch_20newsgroups_fxt(): - return _wrapped_fetch(_fetch_20newsgroups, dataset_name='20newsgroups') + return _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups') @pytest.fixture def fetch_20newsgroups_vectorized_fxt(): - return _wrapped_fetch(_fetch_20newsgroups_vectorized, + return _wrapped_fetch(fetch_20newsgroups_vectorized, dataset_name='20newsgroups_vectorized') @pytest.fixture def fetch_california_housing_fxt(): - return _wrapped_fetch(_fetch_california_housing, + return _wrapped_fetch(fetch_california_housing, dataset_name='california_housing') @pytest.fixture def fetch_covtype_fxt(): - return _wrapped_fetch(_fetch_covtype, dataset_name='covtype') + return _wrapped_fetch(fetch_covtype, dataset_name='covtype') 
@pytest.fixture def fetch_kddcup99_fxt(): - return _wrapped_fetch(_fetch_kddcup99, dataset_name='kddcup99') + return _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99') @pytest.fixture def fetch_olivetti_faces_fxt(): - return _wrapped_fetch(_fetch_olivetti_faces, dataset_name='olivetti_faces') + return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces') @pytest.fixture def fetch_rcv1_fxt(): - return _wrapped_fetch(_fetch_rcv1, dataset_name='rcv1') + return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1') From 089890f71c34c76e2b41a7214ce9cdaf758e86f2 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 1 Mar 2020 21:39:08 -0500 Subject: [PATCH 15/15] BLD [scipy-dev]
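
Taken together, the series replaces the old SkipTest-based guards in the individual dataset test modules with shared pytest fixtures in sklearn/datasets/tests/conftest.py: each fetch_* function is wrapped so that download_if_missing is True only when SKLEARN_SKIP_NETWORK_TESTS=0 is exported (as in the Travis cron job), and pytest.skip is raised on IOError when the data cannot be loaded from data_home. The condensed sketch below is not part of any patch above; it merges the conftest.py fixture and the covtype test into a single file purely to illustrate the resulting pattern, using the same names and shapes that appear in the diffs.

    from os import environ

    import pytest
    from sklearn.datasets import fetch_covtype


    def _wrapped_fetch(f, dataset_name):
        """Fetch dataset (download only if requested by environment)."""
        # Download only when explicitly requested via the environment variable.
        download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'

        def wrapped(*args, **kwargs):
            kwargs['download_if_missing'] = download_if_missing
            try:
                return f(*args, **kwargs)
            except IOError:
                pytest.skip("Download {} to run this test".format(dataset_name))
        return wrapped


    @pytest.fixture
    def fetch_covtype_fxt():
        return _wrapped_fetch(fetch_covtype, dataset_name='covtype')


    def test_fetch(fetch_covtype_fxt):
        # The wrapped fetcher behaves like fetch_covtype, but the test is
        # skipped rather than failed when the data is unavailable locally.
        data = fetch_covtype_fxt(shuffle=True, random_state=42)
        assert data['data'].shape == (581012, 54)

Because the skip decision now lives inside the fixture, the test modules themselves contain no download or skip logic and simply request the fixture by name, which is why the later patches rename the fixtures with an _fxt suffix to avoid shadowing the real sklearn.datasets fetchers.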