From e34aa0ea0223205261d2852f8435de493a9e91f9 Mon Sep 17 00:00:00 2001
From: adrinjalali
Date: Mon, 13 Jan 2020 13:59:06 +0100
Subject: [PATCH] remove _refresh_cache

---
 sklearn/datasets/_base.py                  | 28 ------------
 sklearn/datasets/_california_housing.py    |  5 +-
 sklearn/datasets/_covtype.py               |  7 +--
 sklearn/datasets/_kddcup99.py              |  7 +--
 sklearn/datasets/_olivetti_faces.py        |  5 +-
 sklearn/datasets/_rcv1.py                  | 13 ++----
 sklearn/datasets/_species_distributions.py |  5 +-
 sklearn/datasets/tests/test_base.py        | 53 ----------------------
 8 files changed, 11 insertions(+), 112 deletions(-)

diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
index bdb5cfe8772ca..62bffb947a8ff 100644
--- a/sklearn/datasets/_base.py
+++ b/sklearn/datasets/_base.py
@@ -919,31 +919,3 @@ def _fetch_remote(remote, dirname=None):
                       "file may be corrupted.".format(file_path, checksum,
                                                       remote.checksum))
     return file_path
-
-
-def _refresh_cache(files, compress):
-    # TODO: REMOVE in v0.23
-    import joblib
-    msg = "sklearn.externals.joblib is deprecated in 0.21"
-    with warnings.catch_warnings(record=True) as warns:
-        data = tuple([joblib.load(f) for f in files])
-
-    refresh_needed = any([str(x.message).startswith(msg) for x in warns])
-
-    other_warns = [w for w in warns if not str(w.message).startswith(msg)]
-    for w in other_warns:
-        warnings.warn(message=w.message, category=w.category)
-
-    if refresh_needed:
-        try:
-            for value, path in zip(data, files):
-                joblib.dump(value, path, compress=compress)
-        except IOError:
-            message = ("This dataset will stop being loadable in scikit-learn "
-                       "version 0.23 because it references a deprecated "
-                       "import path. Consider removing the following files "
-                       "and allowing it to be cached anew:\n%s"
-                       % ("\n".join(files)))
-            warnings.warn(message=message, category=FutureWarning)
-
-    return data[0] if len(data) == 1 else data
diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py
index c71ebf3871b75..958184369b63d 100644
--- a/sklearn/datasets/_california_housing.py
+++ b/sklearn/datasets/_california_housing.py
@@ -35,7 +35,6 @@
 from ._base import _fetch_remote
 from ._base import _pkl_filepath
 from ._base import RemoteFileMetadata
-from ._base import _refresh_cache
 from ..utils import Bunch

 # The original data can be found at:
@@ -146,9 +145,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
         remove(archive_path)

     else:
-        cal_housing = _refresh_cache([filepath], 6)
-        # TODO: Revert to the following line in v0.23
-        # cal_housing = joblib.load(filepath)
+        cal_housing = joblib.load(filepath)

     feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms",
                      "Population", "AveOccup", "Latitude", "Longitude"]
diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py
index f9fab853adc84..367ec1f9e2970 100644
--- a/sklearn/datasets/_covtype.py
+++ b/sklearn/datasets/_covtype.py
@@ -25,7 +25,6 @@
 from . import get_data_home
 from ._base import _fetch_remote
 from ._base import RemoteFileMetadata
-from ._base import _refresh_cache
 from ..utils import Bunch
 from ._base import _pkl_filepath
 from ..utils import check_random_state
@@ -126,10 +125,8 @@ def fetch_covtype(data_home=None, download_if_missing=True,
     try:
         X, y
     except NameError:
-        X, y = _refresh_cache([samples_path, targets_path], 9)
-        # TODO: Revert to the following two lines in v0.23
-        # X = joblib.load(samples_path)
-        # y = joblib.load(targets_path)
+        X = joblib.load(samples_path)
+        y = joblib.load(targets_path)

     if shuffle:
         ind = np.arange(X.shape[0])
diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py
index 0a8121521ac82..4585df8b0fb8b 100644
--- a/sklearn/datasets/_kddcup99.py
+++ b/sklearn/datasets/_kddcup99.py
@@ -20,7 +20,6 @@
 from ._base import _fetch_remote
 from . import get_data_home
 from ._base import RemoteFileMetadata
-from ._base import _refresh_cache
 from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import shuffle as shuffle_method
@@ -293,10 +292,8 @@ def _fetch_brute_kddcup99(data_home=None,
     try:
         X, y
     except NameError:
-        X, y = _refresh_cache([samples_path, targets_path], 0)
-        # TODO: Revert to the following two lines in v0.23
-        # X = joblib.load(samples_path)
-        # y = joblib.load(targets_path)
+        X = joblib.load(samples_path)
+        y = joblib.load(targets_path)

     return Bunch(data=X, target=y)

diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py
index f88f088e82e8b..d1a9805b495f2 100644
--- a/sklearn/datasets/_olivetti_faces.py
+++ b/sklearn/datasets/_olivetti_faces.py
@@ -24,7 +24,6 @@
 from ._base import _fetch_remote
 from ._base import RemoteFileMetadata
 from ._base import _pkl_filepath
-from ._base import _refresh_cache
 from ..utils import check_random_state, Bunch

 # The original data can be found at:
@@ -110,9 +109,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         joblib.dump(faces, filepath, compress=6)
         del mfile
     else:
-        faces = _refresh_cache([filepath], 6)
-        # TODO: Revert to the following line in v0.23
-        # faces = joblib.load(filepath)
+        faces = joblib.load(filepath)

     # We want floating point data, but float32 is enough (there is only
     # one byte of precision in the original uint8s anyway)
diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py
index 0836fe1249271..d930a347b7f7c 100644
--- a/sklearn/datasets/_rcv1.py
+++ b/sklearn/datasets/_rcv1.py
@@ -22,7 +22,6 @@
 from ._base import _pkl_filepath
 from ._base import _fetch_remote
 from ._base import RemoteFileMetadata
-from ._base import _refresh_cache
 from ._svmlight_format_io import load_svmlight_files
 from ..utils import shuffle as shuffle_
 from ..utils import Bunch
@@ -190,10 +189,8 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
             f.close()
             remove(f.name)
     else:
-        X, sample_id = _refresh_cache([samples_path, sample_id_path], 9)
-        # TODO: Revert to the following two lines in v0.23
-        # X = joblib.load(samples_path)
-        # sample_id = joblib.load(sample_id_path)
+        X = joblib.load(samples_path)
+        sample_id = joblib.load(sample_id_path)

     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
@@ -246,10 +243,8 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         joblib.dump(y, sample_topics_path, compress=9)
         joblib.dump(categories, topics_path, compress=9)
     else:
-        y, categories = _refresh_cache([sample_topics_path, topics_path], 9)
-        # TODO: Revert to the following two lines in v0.23
-        # y = joblib.load(sample_topics_path)
-        # categories = joblib.load(topics_path)
+        y = joblib.load(sample_topics_path)
+        categories = joblib.load(topics_path)

     if subset == 'all':
         pass
diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py
index 99dc192af755b..7800dfce2c190 100644
--- a/sklearn/datasets/_species_distributions.py
+++ b/sklearn/datasets/_species_distributions.py
@@ -51,7 +51,6 @@
 from ._base import RemoteFileMetadata
 from ..utils import Bunch
 from ._base import _pkl_filepath
-from ._base import _refresh_cache

 # The original data can be found at:
 # https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip
@@ -260,8 +259,6 @@ def fetch_species_distributions(data_home=None,
                      **extra_params)
         joblib.dump(bunch, archive_path, compress=9)
     else:
-        bunch = _refresh_cache([archive_path], 9)
-        # TODO: Revert to the following line in v0.23
-        # bunch = joblib.load(archive_path)
+        bunch = joblib.load(archive_path)

     return bunch
diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index 3a0ad41ced969..7f56217e93455 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -24,7 +24,6 @@
 from sklearn.datasets import load_boston
 from sklearn.datasets import load_wine
 from sklearn.utils import Bunch
-from sklearn.datasets._base import _refresh_cache
 from sklearn.datasets.tests.test_common import check_return_X_y

 from sklearn.externals._pilutil import pillow_installed
@@ -277,55 +276,3 @@ def test_bunch_dir():
     # check that dir (important for autocomplete) shows attributes
     data = load_iris()
     assert "data" in dir(data)
-
-
-def test_refresh_cache(monkeypatch):
-    # uses pytest's monkeypatch fixture
-    # https://docs.pytest.org/en/latest/monkeypatch.html
-
-    def _load_warn(*args, **kwargs):
-        # raise the warning from "externals.joblib.__init__.py"
-        # this is raised when a file persisted by the old joblib is loaded now
-        msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be "
-               "removed in 0.23. Please import this functionality directly "
-               "from joblib, which can be installed with: pip install joblib. "
-               "If this warning is raised when loading pickled models, you "
-               "may need to re-serialize those models with scikit-learn "
-               "0.21+.")
-        warnings.warn(msg, FutureWarning)
-        return 0
-
-    def _load_warn_unrelated(*args, **kwargs):
-        warnings.warn("unrelated warning", FutureWarning)
-        return 0
-
-    def _dump_safe(*args, **kwargs):
-        pass
-
-    def _dump_raise(*args, **kwargs):
-        # this happens if the file is read-only and joblib.dump fails to write
-        # on it.
-        raise IOError()
-
-    # test if the dataset-specific warning is raised if load raises the joblib
-    # warning, and dump fails to dump with new joblib
-    monkeypatch.setattr(joblib, "load", _load_warn)
-    monkeypatch.setattr(joblib, "dump", _dump_raise)
-    msg = "This dataset will stop being loadable in scikit-learn"
-    with pytest.warns(FutureWarning, match=msg):
-        _refresh_cache('test', 0)
-
-    # make sure no warning is raised if load raises the warning, but dump
-    # manages to dump the new data
-    monkeypatch.setattr(joblib, "load", _load_warn)
-    monkeypatch.setattr(joblib, "dump", _dump_safe)
-    with pytest.warns(None) as warns:
-        _refresh_cache('test', 0)
-    assert len(warns) == 0
-
-    # test if an unrelated warning is still passed through and not suppressed
-    # by _refresh_cache
-    monkeypatch.setattr(joblib, "load", _load_warn_unrelated)
-    monkeypatch.setattr(joblib, "dump", _dump_safe)
-    with pytest.warns(FutureWarning, match="unrelated warning"):
-        _refresh_cache('test', 0)
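
Reviewer note: with the _refresh_cache shim removed, every fetcher falls back
to the plain dump-on-first-fetch / load-on-later-fetch pattern visible in the
hunks above. Below is a minimal, self-contained sketch of that pattern for
reference; fetch_demo_dataset, _download_payload, and CACHE_PATH are
hypothetical stand-ins for illustration, not scikit-learn API.

    import os

    import joblib

    CACHE_PATH = "/tmp/demo_dataset.pkz"  # hypothetical cache file


    def _download_payload():
        # stand-in for the download-and-parse step of a real fetcher
        return {"data": list(range(10))}


    def fetch_demo_dataset(download_if_missing=True):
        if not os.path.exists(CACHE_PATH):
            if not download_if_missing:
                raise IOError("Data not found and download_if_missing is False")
            payload = _download_payload()
            # first fetch: persist the parsed data with joblib, as the
            # fetchers above do (compress=6 here; the patch shows levels
            # 0, 6, and 9 in use depending on the dataset)
            joblib.dump(payload, CACHE_PATH, compress=6)
            return payload
        # later fetches: load the cache directly; after this patch there is
        # no _refresh_cache() re-serialization step in between
        return joblib.load(CACHE_PATH)

The compress levels are taken from the diff itself: fetch_california_housing
and fetch_olivetti_faces cache with compress=6, fetch_covtype, fetch_rcv1, and
fetch_species_distributions with compress=9, and the kddcup99 fetcher with
compress=0.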