From 20fac4fe412970db70b17f42b5287de4a84c44d3 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 26 Jun 2019 15:11:08 +0200 Subject: [PATCH 01/18] introduce refresh_cache param to fetch_data --- sklearn/datasets/base.py | 23 +++++++++++++++++++++++ sklearn/datasets/covtype.py | 19 ++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 0b8f73c86117b..e60d4c87d63e5 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -10,6 +10,7 @@ import csv import sys import shutil +import warnings from collections import namedtuple from os import environ, listdir, makedirs from os.path import dirname, exists, expanduser, isdir, join, splitext @@ -919,3 +920,25 @@ def _fetch_remote(remote, dirname=None): "file may be corrupted.".format(file_path, checksum, remote.checksum)) return file_path + + +def _refresh_cache(path, refresh_cache): + if not refresh_cache: + return + + if refresh_cache == True: + shutil.rmtree(path) + return + + import joblib + samples_path = _pkl_filepath(path, "samples") + targets_path = _pkl_filepath(path, "targets") + msg = "sklearn.externals.joblib is deprecated in 0.21" + with warnings.catch_warnings(record=True) as warns: + _ = joblib.load(samples_path) + _ = joblib.load(targets_path) + + refresh_needed = any([str(x.message).startswith(msg) for x in warns]) + + if refresh_needed: + shutil.rmtree(path) diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index 9d995810bee3f..c89e7fdf88798 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -25,6 +25,7 @@ from .base import get_data_home from .base import _fetch_remote from .base import RemoteFileMetadata +from .base import _refresh_cache from ..utils import Bunch from .base import _pkl_filepath from ..utils import check_random_state @@ -41,7 +42,8 @@ def fetch_covtype(data_home=None, download_if_missing=True, - random_state=None, shuffle=False, 
return_X_y=False): + random_state=None, shuffle=False, return_X_y=False, + refresh_cache='joblib'): """Load the covertype dataset (classification). Download it if necessary. @@ -79,6 +81,17 @@ def fetch_covtype(data_home=None, download_if_missing=True, .. versionadded:: 0.20 + refresh_cache : str or bool, optional (default='joblib') + - ``True``: remove the previously downloaded data, and fetche it again. + - ``'joblib'``: only re-fetch the data if the previously downloaded + data has been persisted using the previously vendored `joblib`. + - ``False``: do not re-fetch the data. + + From version 0.23, ``'joblib'`` as an input value will be ignored and + assumed ``False``. + + .. versionadded:: 0.21.3 + Returns ------- dataset : dict-like object with the following attributes: @@ -96,12 +109,16 @@ def fetch_covtype(data_home=None, download_if_missing=True, (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.20 + """ data_home = get_data_home(data_home=data_home) covtype_dir = join(data_home, "covertype") samples_path = _pkl_filepath(covtype_dir, "samples") targets_path = _pkl_filepath(covtype_dir, "targets") + + _refresh_cache(covtype_dir, refresh_cache) + available = exists(samples_path) if download_if_missing and not available: From 983da6c2f0f15e13a60361484e8394870b1416a3 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 26 Jun 2019 15:16:26 +0200 Subject: [PATCH 02/18] pep8 --- sklearn/datasets/base.py | 2 +- sklearn/datasets/covtype.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index e60d4c87d63e5..cdbb4938846d4 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -926,7 +926,7 @@ def _refresh_cache(path, refresh_cache): if not refresh_cache: return - if refresh_cache == True: + if refresh_cache is True: shutil.rmtree(path) return diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index c89e7fdf88798..e22e1ab0830d6 100644 --- 
a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -86,7 +86,7 @@ def fetch_covtype(data_home=None, download_if_missing=True, - ``'joblib'``: only re-fetch the data if the previously downloaded data has been persisted using the previously vendored `joblib`. - ``False``: do not re-fetch the data. - + From version 0.23, ``'joblib'`` as an input value will be ignored and assumed ``False``. From ef7160c9cd8e493009abaab6ff748e52bc05e528 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 26 Jun 2019 15:19:42 +0200 Subject: [PATCH 03/18] remove irrelevant line --- sklearn/datasets/covtype.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index e22e1ab0830d6..fad123de27a7a 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -109,7 +109,6 @@ def fetch_covtype(data_home=None, download_if_missing=True, (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.20 - """ data_home = get_data_home(data_home=data_home) From 0545b305192427959bf3b888491caa4b177008b4 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 27 Jun 2019 14:59:22 +0200 Subject: [PATCH 04/18] do not re-download --- sklearn/datasets/base.py | 22 +++++++++++----------- sklearn/datasets/covtype.py | 20 ++++---------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index cdbb4938846d4..f6c4bf76de58a 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -922,23 +922,23 @@ def _fetch_remote(remote, dirname=None): return file_path -def _refresh_cache(path, refresh_cache): - if not refresh_cache: - return - - if refresh_cache is True: - shutil.rmtree(path) - return - +def _refresh_cache(path): + # REMOVE in v0.23 import joblib samples_path = _pkl_filepath(path, "samples") targets_path = _pkl_filepath(path, "targets") msg = "sklearn.externals.joblib is deprecated in 0.21" with 
warnings.catch_warnings(record=True) as warns: - _ = joblib.load(samples_path) - _ = joblib.load(targets_path) + X = joblib.load(samples_path) + y = joblib.load(targets_path) refresh_needed = any([str(x.message).startswith(msg) for x in warns]) if refresh_needed: - shutil.rmtree(path) + try: + joblib.dump(X, samples_path, compress=9) + joblib.dump(y, samples_path, compress=9) + except IOError: + pass + + return X, y \ No newline at end of file diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index fad123de27a7a..8c8f97f0b7b61 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -81,17 +81,6 @@ def fetch_covtype(data_home=None, download_if_missing=True, .. versionadded:: 0.20 - refresh_cache : str or bool, optional (default='joblib') - - ``True``: remove the previously downloaded data, and fetche it again. - - ``'joblib'``: only re-fetch the data if the previously downloaded - data has been persisted using the previously vendored `joblib`. - - ``False``: do not re-fetch the data. - - From version 0.23, ``'joblib'`` as an input value will be ignored and - assumed ``False``. - - .. 
versionadded:: 0.21.3 - Returns ------- dataset : dict-like object with the following attributes: @@ -115,9 +104,6 @@ def fetch_covtype(data_home=None, download_if_missing=True, covtype_dir = join(data_home, "covertype") samples_path = _pkl_filepath(covtype_dir, "samples") targets_path = _pkl_filepath(covtype_dir, "targets") - - _refresh_cache(covtype_dir, refresh_cache) - available = exists(samples_path) if download_if_missing and not available: @@ -141,8 +127,10 @@ def fetch_covtype(data_home=None, download_if_missing=True, try: X, y except NameError: - X = joblib.load(samples_path) - y = joblib.load(targets_path) + X, y = _refresh_cache(covtype_dir) + # Revert to the following two lines in v0.23 + # X = joblib.load(samples_path) + # y = joblib.load(targets_path) if shuffle: ind = np.arange(X.shape[0]) From 21a19622e54d767bce2dbc4711e7d76b6c865b5f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 27 Jun 2019 15:00:14 +0200 Subject: [PATCH 05/18] pep8 --- sklearn/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index f6c4bf76de58a..f519862b72d56 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -941,4 +941,4 @@ def _refresh_cache(path): except IOError: pass - return X, y \ No newline at end of file + return X, y From 65671ac58f30e23f1464affce332627108f34475 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 27 Jun 2019 15:01:01 +0200 Subject: [PATCH 06/18] remove param --- sklearn/datasets/covtype.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index 8c8f97f0b7b61..24a9c8dfc108f 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -42,8 +42,7 @@ def fetch_covtype(data_home=None, download_if_missing=True, - random_state=None, shuffle=False, return_X_y=False, - refresh_cache='joblib'): + random_state=None, shuffle=False, return_X_y=False): """Load 
the covertype dataset (classification). Download it if necessary. From 4b5fa967037e9ded8adeec4553760dee327da93b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 27 Jun 2019 15:54:10 +0200 Subject: [PATCH 07/18] raise warnings --- sklearn/datasets/base.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index f519862b72d56..b4793f79ce7d0 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -941,4 +941,15 @@ def _refresh_cache(path): except IOError: pass + other_warns = [w for w in warns if not str(w.message).startswith(msg)] + joblib_warning = [w for w in warns + if str(w.message).startswith(msg)][0] + + for w in other_warns: + warnings.warn(message=w.message, category=w.category) + + message = str(joblib_warning.message) + ( + " The persisted files are located under: %s" % path) + warnings.warn(message=message, category=joblib_warning.category) + return X, y From f9b34a7b1bbae9ed4f256de16e060059406d1a49 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 27 Jun 2019 15:55:13 +0200 Subject: [PATCH 08/18] raise warnings, when needed --- sklearn/datasets/base.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index b4793f79ce7d0..c4f5163a388b1 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -935,21 +935,22 @@ def _refresh_cache(path): refresh_needed = any([str(x.message).startswith(msg) for x in warns]) if refresh_needed: + raise_joblib = False try: joblib.dump(X, samples_path, compress=9) joblib.dump(y, samples_path, compress=9) except IOError: - pass + raise_joblib = True other_warns = [w for w in warns if not str(w.message).startswith(msg)] - joblib_warning = [w for w in warns - if str(w.message).startswith(msg)][0] - for w in other_warns: warnings.warn(message=w.message, category=w.category) - message = str(joblib_warning.message) + ( - " The persisted files are 
located under: %s" % path) - warnings.warn(message=message, category=joblib_warning.category) + if raise_joblib: + joblib_warning = [w for w in warns + if str(w.message).startswith(msg)][0] + message = str(joblib_warning.message) + ( + " The persisted files are located under: %s" % path) + warnings.warn(message=message, category=joblib_warning.category) return X, y From 5baba644033478972c709204b66d4547bba7103f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 27 Jun 2019 23:09:51 +0200 Subject: [PATCH 09/18] fix the other fetch_... instances --- sklearn/datasets/base.py | 18 +++++++++--------- sklearn/datasets/california_housing.py | 5 ++++- sklearn/datasets/covtype.py | 2 +- sklearn/datasets/kddcup99.py | 7 +++++-- sklearn/datasets/olivetti_faces.py | 5 ++++- sklearn/datasets/rcv1.py | 13 +++++++++---- sklearn/datasets/species_distributions.py | 5 ++++- 7 files changed, 36 insertions(+), 19 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index c4f5163a388b1..09bf03925e3ad 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -922,23 +922,20 @@ def _fetch_remote(remote, dirname=None): return file_path -def _refresh_cache(path): +def _refresh_cache(files, compress): # REMOVE in v0.23 import joblib - samples_path = _pkl_filepath(path, "samples") - targets_path = _pkl_filepath(path, "targets") msg = "sklearn.externals.joblib is deprecated in 0.21" with warnings.catch_warnings(record=True) as warns: - X = joblib.load(samples_path) - y = joblib.load(targets_path) + data = tuple([joblib.load(f) for f in files]) - refresh_needed = any([str(x.message).startswith(msg) for x in warns]) + refresh_needed = any([str(x.message).startswith(msg) for x in warns]) if refresh_needed: raise_joblib = False try: - joblib.dump(X, samples_path, compress=9) - joblib.dump(y, samples_path, compress=9) + for value, path in zip(data, files): + joblib.dump(value, path, compress=compress) except IOError: raise_joblib = True @@ -953,4 +950,7 @@ 
def _refresh_cache(path): " The persisted files are located under: %s" % path) warnings.warn(message=message, category=joblib_warning.category) - return X, y + if len(data) == 1: + return data[0] + else: + return data diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index 35f0847c1de05..1ec001277020c 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -34,6 +34,7 @@ from .base import _fetch_remote from .base import _pkl_filepath from .base import RemoteFileMetadata +from .base import _refresh_cache from ..utils import Bunch # The original data can be found at: @@ -129,7 +130,9 @@ def fetch_california_housing(data_home=None, download_if_missing=True, remove(archive_path) else: - cal_housing = joblib.load(filepath) + cal_housing = _refresh_cache([filepath], 6) + # Revert to the following two lines in v0.23 + # cal_housing = joblib.load(filepath) feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"] diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index 24a9c8dfc108f..faa521dd03187 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -126,7 +126,7 @@ def fetch_covtype(data_home=None, download_if_missing=True, try: X, y except NameError: - X, y = _refresh_cache(covtype_dir) + X, y = _refresh_cache([samples_path, targets_path], 9) # Revert to the following two lines in v0.23 # X = joblib.load(samples_path) # y = joblib.load(targets_path) diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py index 837a489e7212c..dd3653dfe0083 100644 --- a/sklearn/datasets/kddcup99.py +++ b/sklearn/datasets/kddcup99.py @@ -20,6 +20,7 @@ from .base import _fetch_remote from .base import get_data_home from .base import RemoteFileMetadata +from .base import _refresh_cache from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as 
shuffle_method @@ -292,8 +293,10 @@ def _fetch_brute_kddcup99(data_home=None, try: X, y except NameError: - X = joblib.load(samples_path) - y = joblib.load(targets_path) + X, y = _refresh_cache([samples_path, targets_path], 0) + # Revert to the following two lines in v0.23 + # X = joblib.load(samples_path) + # y = joblib.load(targets_path) return Bunch(data=X, target=y) diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py index a52f90414e104..52503554c485b 100644 --- a/sklearn/datasets/olivetti_faces.py +++ b/sklearn/datasets/olivetti_faces.py @@ -24,6 +24,7 @@ from .base import _fetch_remote from .base import RemoteFileMetadata from .base import _pkl_filepath +from .base import _refresh_cache from ..utils import check_random_state, Bunch # The original data can be found at: @@ -107,7 +108,9 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, joblib.dump(faces, filepath, compress=6) del mfile else: - faces = joblib.load(filepath) + faces = _refresh_cache([filepath], 6) + # Revert to the following two lines in v0.23 + # faces = joblib.load(filepath) # We want floating point data, but float32 is enough (there is only # one byte of precision in the original uint8s anyway) diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py index c95cf1d1be75a..b5f0a47065168 100644 --- a/sklearn/datasets/rcv1.py +++ b/sklearn/datasets/rcv1.py @@ -22,6 +22,7 @@ from .base import _pkl_filepath from .base import _fetch_remote from .base import RemoteFileMetadata +from .base import _refresh_cache from .svmlight_format import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch @@ -189,8 +190,10 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, f.close() remove(f.name) else: - X = joblib.load(samples_path) - sample_id = joblib.load(sample_id_path) + X, sample_id = _refresh_cache([samples_path, sample_id_path], 9) + # Revert to the following two lines in v0.23 + # X = 
joblib.load(samples_path) + # sample_id = joblib.load(sample_id_path) # load target (y), categories, and sample_id_bis if download_if_missing and (not exists(sample_topics_path) or @@ -243,8 +246,10 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(y, sample_topics_path, compress=9) joblib.dump(categories, topics_path, compress=9) else: - y = joblib.load(sample_topics_path) - categories = joblib.load(topics_path) + y, categories = _refresh_cache([sample_topics_path, topics_path], 9) + # Revert to the following two lines in v0.23 + # y = joblib.load(sample_topics_path) + # categories = joblib.load(topics_path) if subset == 'all': pass diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index f9a04f92b8486..1006bd58c62f0 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -51,6 +51,7 @@ from .base import RemoteFileMetadata from ..utils import Bunch from .base import _pkl_filepath +from .base import _refresh_cache # The original data can be found at: # https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip @@ -259,6 +260,8 @@ def fetch_species_distributions(data_home=None, **extra_params) joblib.dump(bunch, archive_path, compress=9) else: - bunch = joblib.load(archive_path) + bunch = _refresh_cache([archive_path], 9) + # Revert to the following two lines in v0.23 + # bunch = joblib.load(archive_path) return bunch From 6fe91a284e8745af2c1d1ab7464169b35b0bd9ca Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 28 Jun 2019 11:43:30 +0200 Subject: [PATCH 10/18] dataset specific message --- sklearn/datasets/base.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 09bf03925e3ad..10016b24afcca 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -932,24 +932,21 @@ def _refresh_cache(files, compress): 
refresh_needed = any([str(x.message).startswith(msg) for x in warns]) if refresh_needed: - raise_joblib = False try: for value, path in zip(data, files): joblib.dump(value, path, compress=compress) except IOError: - raise_joblib = True + message = ("This dataset will stop being loadable in scikit-learn " + "version 0.23 because it references a deprecated " + "import path. Consider removing the following files " + "and allowing it to be cached anew:\n%s" + % ("\n".join(files))) + warnings.warn(message=message, category=DeprecationWarning) other_warns = [w for w in warns if not str(w.message).startswith(msg)] for w in other_warns: warnings.warn(message=w.message, category=w.category) - if raise_joblib: - joblib_warning = [w for w in warns - if str(w.message).startswith(msg)][0] - message = str(joblib_warning.message) + ( - " The persisted files are located under: %s" % path) - warnings.warn(message=message, category=joblib_warning.category) - if len(data) == 1: return data[0] else: From aac769ea2865e5f9a1429b7df41c17c0d7e7d80c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 30 Jun 2019 14:12:02 +0200 Subject: [PATCH 11/18] add tests, always raise unrelated warnings --- sklearn/datasets/base.py | 8 +++--- sklearn/datasets/tests/test_base.py | 42 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 10016b24afcca..25a16ea1bad88 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -931,6 +931,10 @@ def _refresh_cache(files, compress): refresh_needed = any([str(x.message).startswith(msg) for x in warns]) + other_warns = [w for w in warns if not str(w.message).startswith(msg)] + for w in other_warns: + warnings.warn(message=w.message, category=w.category) + if refresh_needed: try: for value, path in zip(data, files): @@ -943,10 +947,6 @@ def _refresh_cache(files, compress): % ("\n".join(files))) warnings.warn(message=message, 
category=DeprecationWarning) - other_warns = [w for w in warns if not str(w.message).startswith(msg)] - for w in other_warns: - warnings.warn(message=w.message, category=w.category) - if len(data) == 1: return data[0] else: diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 676cb00fd16f8..89171143bf579 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -8,6 +8,7 @@ from functools import partial import pytest +import joblib import numpy as np from sklearn.datasets import get_data_home @@ -23,6 +24,7 @@ from sklearn.datasets import load_boston from sklearn.datasets import load_wine from sklearn.datasets.base import Bunch +from sklearn.datasets.base import _refresh_cache from sklearn.datasets.tests.test_common import check_return_X_y from sklearn.externals._pilutil import pillow_installed @@ -277,3 +279,43 @@ def test_bunch_dir(): # check that dir (important for autocomplete) shows attributes data = load_iris() assert "data" in dir(data) + + +def test_refresh_cache(monkeypatch): + def _load_warn(*args, **kwargs): + msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be " + "removed in 0.23. Please import this functionality directly " + "from joblib, which can be installed with: pip install joblib. 
" + "If this warning is raised when loading pickled models, you " + "may need to re-serialize those models with scikit-learn " + "0.21+.") + warnings.warn(msg, DeprecationWarning) + return 0 + + def _load_warn_unrelated(*args, **kwargs): + warnings.warn("unrelated warning", UserWarning) + return 0 + + def _dump_safe(*args, **kwargs): + pass + + def _dump_raise(*args, **kwargs): + raise IOError() + + monkeypatch.setattr(joblib, "load", _load_warn) + monkeypatch.setattr(joblib, "dump", _dump_raise) + msg = "This dataset will stop being loadable in scikit-learn" + with pytest.warns(DeprecationWarning, match=msg): + _refresh_cache('test', 0) + + monkeypatch.setattr(joblib, "load", _load_warn) + monkeypatch.setattr(joblib, "dump", _dump_safe) + with warnings.catch_warnings(record=True) as warns: + _refresh_cache('test', 0) + + assert len(warns) == 0 + + monkeypatch.setattr(joblib, "load", _load_warn_unrelated) + monkeypatch.setattr(joblib, "dump", _dump_safe) + with pytest.warns(UserWarning, match="unrelated warning"): + _refresh_cache('test', 0) From 8a772eb9edc679ea91ced37f846582e46998a2d5 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 30 Jun 2019 14:18:53 +0200 Subject: [PATCH 12/18] add test comments --- sklearn/datasets/tests/test_base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 89171143bf579..e9c0beb654da7 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -283,6 +283,8 @@ def test_bunch_dir(): def test_refresh_cache(monkeypatch): def _load_warn(*args, **kwargs): + # raise the warning from "externals.joblib.__init__.py" + # this is raised when a file persisted by the old joblib is loaded now msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be " "removed in 0.23. Please import this functionality directly " "from joblib, which can be installed with: pip install joblib. 
" @@ -300,21 +302,28 @@ def _dump_safe(*args, **kwargs): pass def _dump_raise(*args, **kwargs): + # this happens if the file is read-only and joblib.dump fails to write + # on it. raise IOError() + # test if the dataset spesific warning is raised if load raises the joblib + # warning, and dump fails to dump with new joblib monkeypatch.setattr(joblib, "load", _load_warn) monkeypatch.setattr(joblib, "dump", _dump_raise) msg = "This dataset will stop being loadable in scikit-learn" with pytest.warns(DeprecationWarning, match=msg): _refresh_cache('test', 0) + # make sure no warning is raised if load raises the warning, but dump + # manages to dump the new data monkeypatch.setattr(joblib, "load", _load_warn) monkeypatch.setattr(joblib, "dump", _dump_safe) with warnings.catch_warnings(record=True) as warns: _refresh_cache('test', 0) - assert len(warns) == 0 + # test if an unrelated warning is still passed through and not suppressed + # by _refresh_cache monkeypatch.setattr(joblib, "load", _load_warn_unrelated) monkeypatch.setattr(joblib, "dump", _dump_safe) with pytest.warns(UserWarning, match="unrelated warning"): From 2adbfd75970388224bbb560382ee76d8a6bca091 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 30 Jun 2019 14:24:29 +0200 Subject: [PATCH 13/18] add pytests mokneypatch link --- sklearn/datasets/tests/test_base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index e9c0beb654da7..ba3deec4c6d3d 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -282,6 +282,9 @@ def test_bunch_dir(): def test_refresh_cache(monkeypatch): + # uses pytests monkeypatch fixture + # https://docs.pytest.org/en/latest/monkeypatch.html + def _load_warn(*args, **kwargs): # raise the warning from "externals.joblib.__init__.py" # this is raised when a file persisted by the old joblib is loaded now From 532d57f38e6d9f23c798eb571ab9d01e3837e091 Mon Sep 17 00:00:00 
2001 From: adrinjalali Date: Sun, 30 Jun 2019 15:22:55 +0200 Subject: [PATCH 14/18] use pytest.warns to record warnings --- sklearn/datasets/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index ba3deec4c6d3d..5b77cbda30d1d 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -321,7 +321,7 @@ def _dump_raise(*args, **kwargs): # manages to dump the new data monkeypatch.setattr(joblib, "load", _load_warn) monkeypatch.setattr(joblib, "dump", _dump_safe) - with warnings.catch_warnings(record=True) as warns: + with pytest.warns(None) as warns: _refresh_cache('test', 0) assert len(warns) == 0 From 98fc6decf3206d55360735a808e3693ed1a4bf4a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 30 Jun 2019 18:30:32 +0200 Subject: [PATCH 15/18] UserWarning -> DeprecationWarning --- sklearn/datasets/tests/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 5b77cbda30d1d..b90894b5430c3 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -298,7 +298,7 @@ def _load_warn(*args, **kwargs): return 0 def _load_warn_unrelated(*args, **kwargs): - warnings.warn("unrelated warning", UserWarning) + warnings.warn("unrelated warning", DeprecationWarning) return 0 def _dump_safe(*args, **kwargs): @@ -329,5 +329,5 @@ def _dump_raise(*args, **kwargs): # by _refresh_cache monkeypatch.setattr(joblib, "load", _load_warn_unrelated) monkeypatch.setattr(joblib, "dump", _dump_safe) - with pytest.warns(UserWarning, match="unrelated warning"): + with pytest.warns(DeprecationWarning, match="unrelated warning"): _refresh_cache('test', 0) From a06a2c890cc180aed223cc82f8774c10fbfb89b3 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 5 Jul 2019 12:21:18 +0200 Subject: [PATCH 16/18] apply comments --- 
doc/whats_new/v0.21.rst | 10 ++++++++++ sklearn/datasets/base.py | 7 ++----- sklearn/datasets/california_housing.py | 2 +- sklearn/datasets/covtype.py | 2 +- sklearn/datasets/kddcup99.py | 2 +- sklearn/datasets/olivetti_faces.py | 2 +- sklearn/datasets/rcv1.py | 4 ++-- sklearn/datasets/species_distributions.py | 2 +- 8 files changed, 19 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2e1c639e267b7..78c92ebf97534 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -12,6 +12,16 @@ Version 0.21.3 Changelog --------- +:mod:`sklearn.datasets` +....................... + +- |Fix| :func:`fetch_california_housing`, :func:`fetch_covtype`, + :func:`fetch_kddcup99`, :func:`fetch_olivetti_faces`, + :func:`fetch_rcv1`, and :func:`fetch_species_distributions` try to persist + the previously cache using the new ``joblib`` if the cahce was persisted + using the deprecated ``sklearn.externals.joblib``. This behavior is set to + be deprecated and removed in v0.23. :pr:`14197` by `Adrin Jalali`_. + :mod:`sklearn.impute` ..................... 
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 25a16ea1bad88..c353746c1c326 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -923,7 +923,7 @@ def _fetch_remote(remote, dirname=None): def _refresh_cache(files, compress): - # REMOVE in v0.23 + # TODO: REMOVE in v0.23 import joblib msg = "sklearn.externals.joblib is deprecated in 0.21" with warnings.catch_warnings(record=True) as warns: @@ -947,7 +947,4 @@ def _refresh_cache(files, compress): % ("\n".join(files))) warnings.warn(message=message, category=DeprecationWarning) - if len(data) == 1: - return data[0] - else: - return data + return data[0] if len(data) == 1 else data diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index 1ec001277020c..7d8b1aa3ede45 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -131,7 +131,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True, else: cal_housing = _refresh_cache([filepath], 6) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following line in v0.23 # cal_housing = joblib.load(filepath) feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index faa521dd03187..4108b1d79f84b 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -127,7 +127,7 @@ def fetch_covtype(data_home=None, download_if_missing=True, X, y except NameError: X, y = _refresh_cache([samples_path, targets_path], 9) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following two lines in v0.23 # X = joblib.load(samples_path) # y = joblib.load(targets_path) diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py index dd3653dfe0083..f50f49f85ab6f 100644 --- a/sklearn/datasets/kddcup99.py +++ b/sklearn/datasets/kddcup99.py @@ -294,7 +294,7 @@ def _fetch_brute_kddcup99(data_home=None, X, y 
except NameError: X, y = _refresh_cache([samples_path, targets_path], 0) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following two lines in v0.23 # X = joblib.load(samples_path) # y = joblib.load(targets_path) diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py index 52503554c485b..24eeb7927abcf 100644 --- a/sklearn/datasets/olivetti_faces.py +++ b/sklearn/datasets/olivetti_faces.py @@ -109,7 +109,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, del mfile else: faces = _refresh_cache([filepath], 6) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following line in v0.23 # faces = joblib.load(filepath) # We want floating point data, but float32 is enough (there is only diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py index b5f0a47065168..c000acf13e249 100644 --- a/sklearn/datasets/rcv1.py +++ b/sklearn/datasets/rcv1.py @@ -191,7 +191,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, remove(f.name) else: X, sample_id = _refresh_cache([samples_path, sample_id_path], 9) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following two lines in v0.23 # X = joblib.load(samples_path) # sample_id = joblib.load(sample_id_path) @@ -247,7 +247,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(categories, topics_path, compress=9) else: y, categories = _refresh_cache([sample_topics_path, topics_path], 9) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following two lines in v0.23 # y = joblib.load(sample_topics_path) # categories = joblib.load(topics_path) diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index 1006bd58c62f0..82ae22129ab9b 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -261,7 +261,7 @@ def 
fetch_species_distributions(data_home=None, joblib.dump(bunch, archive_path, compress=9) else: bunch = _refresh_cache([archive_path], 9) - # Revert to the following two lines in v0.23 + # TODO: Revert to the following line in v0.23 # bunch = joblib.load(archive_path) return bunch From 5713a236e9e3b0312cf3e35f458c82fe07750dd5 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 8 Jul 2019 14:07:13 +0200 Subject: [PATCH 17/18] cache -> cached data --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 78c92ebf97534..74d0c88194347 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -18,7 +18,7 @@ Changelog - |Fix| :func:`fetch_california_housing`, :func:`fetch_covtype`, :func:`fetch_kddcup99`, :func:`fetch_olivetti_faces`, :func:`fetch_rcv1`, and :func:`fetch_species_distributions` try to persist - the previously cache using the new ``joblib`` if the cahce was persisted + the previously cache using the new ``joblib`` if the cahced data was persisted using the deprecated ``sklearn.externals.joblib``. This behavior is set to be deprecated and removed in v0.23. :pr:`14197` by `Adrin Jalali`_. From 6cf82540cd35f34f320076263d41c959b4f1f91d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 9 Jul 2019 10:11:42 +0200 Subject: [PATCH 18/18] fixing references in whats_new --- doc/whats_new/v0.21.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 74d0c88194347..cf3302ad62c00 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -15,12 +15,14 @@ Changelog :mod:`sklearn.datasets` ....................... 
-- |Fix| :func:`fetch_california_housing`, :func:`fetch_covtype`, - :func:`fetch_kddcup99`, :func:`fetch_olivetti_faces`, - :func:`fetch_rcv1`, and :func:`fetch_species_distributions` try to persist - the previously cache using the new ``joblib`` if the cahced data was persisted - using the deprecated ``sklearn.externals.joblib``. This behavior is set to - be deprecated and removed in v0.23. :pr:`14197` by `Adrin Jalali`_. +- |Fix| :func:`datasets.fetch_california_housing`, + :func:`datasets.fetch_covtype`, + :func:`datasets.fetch_kddcup99`, :func:`datasets.fetch_olivetti_faces`, + :func:`datasets.fetch_rcv1`, and :func:`datasets.fetch_species_distributions` + try to persist the previously cached data using the new ``joblib`` if the cached + data was persisted using the deprecated ``sklearn.externals.joblib``. This + behavior is set to be deprecated and removed in v0.23. + :pr:`14197` by `Adrin Jalali`_. :mod:`sklearn.impute` .....................