From 20fac4fe412970db70b17f42b5287de4a84c44d3 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 26 Jun 2019 15:11:08 +0200
Subject: [PATCH 01/18] introduce refresh_cache param to fetch_data

---
 sklearn/datasets/base.py    | 23 +++++++++++++++++++++++
 sklearn/datasets/covtype.py | 19 ++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index 0b8f73c86117b..e60d4c87d63e5 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -10,6 +10,7 @@
 import csv
 import sys
 import shutil
+import warnings
 from collections import namedtuple
 from os import environ, listdir, makedirs
 from os.path import dirname, exists, expanduser, isdir, join, splitext
@@ -919,3 +920,25 @@ def _fetch_remote(remote, dirname=None):
                       "file may be corrupted.".format(file_path, checksum,
                                                       remote.checksum))
     return file_path
+
+
+def _refresh_cache(path, refresh_cache):
+    if not refresh_cache:
+        return
+
+    if refresh_cache == True:
+        shutil.rmtree(path)
+        return
+
+    import joblib
+    samples_path = _pkl_filepath(path, "samples")
+    targets_path = _pkl_filepath(path, "targets")
+    msg = "sklearn.externals.joblib is deprecated in 0.21"
+    with warnings.catch_warnings(record=True) as warns:
+        _ = joblib.load(samples_path)
+        _ = joblib.load(targets_path)
+
+        refresh_needed = any([str(x.message).startswith(msg) for x in warns])
+
+    if refresh_needed:
+        shutil.rmtree(path)
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index 9d995810bee3f..c89e7fdf88798 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -25,6 +25,7 @@
 from .base import get_data_home
 from .base import _fetch_remote
 from .base import RemoteFileMetadata
+from .base import _refresh_cache
 from ..utils import Bunch
 from .base import _pkl_filepath
 from ..utils import check_random_state
@@ -41,7 +42,8 @@
 
 
 def fetch_covtype(data_home=None, download_if_missing=True,
-                  random_state=None, shuffle=False, return_X_y=False):
+                  random_state=None, shuffle=False, return_X_y=False,
+                  refresh_cache='joblib'):
     """Load the covertype dataset (classification).
 
     Download it if necessary.
@@ -79,6 +81,17 @@ def fetch_covtype(data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
+    refresh_cache : str or bool, optional (default='joblib')
+        - ``True``: remove the previously downloaded data, and fetche it again.
+        - ``'joblib'``: only re-fetch the data if the previously downloaded
+          data has been persisted using the previously vendored `joblib`.
+        - ``False``: do not re-fetch the data.
+        
+        From version 0.23, ``'joblib'`` as an input value will be ignored and
+        assumed ``False``.
+
+        .. versionadded:: 0.21.3
+
     Returns
     -------
     dataset : dict-like object with the following attributes:
@@ -96,12 +109,16 @@ def fetch_covtype(data_home=None, download_if_missing=True,
     (data, target) : tuple if ``return_X_y`` is True
 
         .. versionadded:: 0.20
+
     """
 
     data_home = get_data_home(data_home=data_home)
     covtype_dir = join(data_home, "covertype")
     samples_path = _pkl_filepath(covtype_dir, "samples")
     targets_path = _pkl_filepath(covtype_dir, "targets")
+
+    _refresh_cache(covtype_dir, refresh_cache)
+
     available = exists(samples_path)
 
     if download_if_missing and not available:

From 983da6c2f0f15e13a60361484e8394870b1416a3 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 26 Jun 2019 15:16:26 +0200
Subject: [PATCH 02/18] pep8

---
 sklearn/datasets/base.py    | 2 +-
 sklearn/datasets/covtype.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index e60d4c87d63e5..cdbb4938846d4 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -926,7 +926,7 @@ def _refresh_cache(path, refresh_cache):
     if not refresh_cache:
         return
 
-    if refresh_cache == True:
+    if refresh_cache is True:
         shutil.rmtree(path)
         return
 
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index c89e7fdf88798..e22e1ab0830d6 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -86,7 +86,7 @@ def fetch_covtype(data_home=None, download_if_missing=True,
         - ``'joblib'``: only re-fetch the data if the previously downloaded
           data has been persisted using the previously vendored `joblib`.
         - ``False``: do not re-fetch the data.
-        
+
         From version 0.23, ``'joblib'`` as an input value will be ignored and
         assumed ``False``.
 

From ef7160c9cd8e493009abaab6ff748e52bc05e528 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 26 Jun 2019 15:19:42 +0200
Subject: [PATCH 03/18] remove irrelevant line

---
 sklearn/datasets/covtype.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index e22e1ab0830d6..fad123de27a7a 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -109,7 +109,6 @@ def fetch_covtype(data_home=None, download_if_missing=True,
     (data, target) : tuple if ``return_X_y`` is True
 
         .. versionadded:: 0.20
-
     """
 
     data_home = get_data_home(data_home=data_home)

From 0545b305192427959bf3b888491caa4b177008b4 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 27 Jun 2019 14:59:22 +0200
Subject: [PATCH 04/18] do not re-download

---
 sklearn/datasets/base.py    | 22 +++++++++++-----------
 sklearn/datasets/covtype.py | 20 ++++----------------
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index cdbb4938846d4..f6c4bf76de58a 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -922,23 +922,23 @@ def _fetch_remote(remote, dirname=None):
     return file_path
 
 
-def _refresh_cache(path, refresh_cache):
-    if not refresh_cache:
-        return
-
-    if refresh_cache is True:
-        shutil.rmtree(path)
-        return
-
+def _refresh_cache(path):
+    # REMOVE in v0.23
     import joblib
     samples_path = _pkl_filepath(path, "samples")
     targets_path = _pkl_filepath(path, "targets")
     msg = "sklearn.externals.joblib is deprecated in 0.21"
     with warnings.catch_warnings(record=True) as warns:
-        _ = joblib.load(samples_path)
-        _ = joblib.load(targets_path)
+        X = joblib.load(samples_path)
+        y = joblib.load(targets_path)
 
         refresh_needed = any([str(x.message).startswith(msg) for x in warns])
 
     if refresh_needed:
-        shutil.rmtree(path)
+        try:
+            joblib.dump(X, samples_path, compress=9)
+            joblib.dump(y, samples_path, compress=9)
+        except IOError:
+            pass
+
+    return X, y
\ No newline at end of file
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index fad123de27a7a..8c8f97f0b7b61 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -81,17 +81,6 @@ def fetch_covtype(data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
-    refresh_cache : str or bool, optional (default='joblib')
-        - ``True``: remove the previously downloaded data, and fetche it again.
-        - ``'joblib'``: only re-fetch the data if the previously downloaded
-          data has been persisted using the previously vendored `joblib`.
-        - ``False``: do not re-fetch the data.
-
-        From version 0.23, ``'joblib'`` as an input value will be ignored and
-        assumed ``False``.
-
-        .. versionadded:: 0.21.3
-
     Returns
     -------
     dataset : dict-like object with the following attributes:
@@ -115,9 +104,6 @@ def fetch_covtype(data_home=None, download_if_missing=True,
     covtype_dir = join(data_home, "covertype")
     samples_path = _pkl_filepath(covtype_dir, "samples")
     targets_path = _pkl_filepath(covtype_dir, "targets")
-
-    _refresh_cache(covtype_dir, refresh_cache)
-
     available = exists(samples_path)
 
     if download_if_missing and not available:
@@ -141,8 +127,10 @@ def fetch_covtype(data_home=None, download_if_missing=True,
     try:
         X, y
     except NameError:
-        X = joblib.load(samples_path)
-        y = joblib.load(targets_path)
+        X, y = _refresh_cache(covtype_dir)
+        # Revert to the following two lines in v0.23
+        # X = joblib.load(samples_path)
+        # y = joblib.load(targets_path)
 
     if shuffle:
         ind = np.arange(X.shape[0])

From 21a19622e54d767bce2dbc4711e7d76b6c865b5f Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 27 Jun 2019 15:00:14 +0200
Subject: [PATCH 05/18] pep8

---
 sklearn/datasets/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index f6c4bf76de58a..f519862b72d56 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -941,4 +941,4 @@ def _refresh_cache(path):
         except IOError:
             pass
 
-    return X, y
\ No newline at end of file
+    return X, y

From 65671ac58f30e23f1464affce332627108f34475 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 27 Jun 2019 15:01:01 +0200
Subject: [PATCH 06/18] remove param

---
 sklearn/datasets/covtype.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index 8c8f97f0b7b61..24a9c8dfc108f 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -42,8 +42,7 @@
 
 
 def fetch_covtype(data_home=None, download_if_missing=True,
-                  random_state=None, shuffle=False, return_X_y=False,
-                  refresh_cache='joblib'):
+                  random_state=None, shuffle=False, return_X_y=False):
     """Load the covertype dataset (classification).
 
     Download it if necessary.

From 4b5fa967037e9ded8adeec4553760dee327da93b Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 27 Jun 2019 15:54:10 +0200
Subject: [PATCH 07/18] raise warnings

---
 sklearn/datasets/base.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index f519862b72d56..b4793f79ce7d0 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -941,4 +941,15 @@ def _refresh_cache(path):
         except IOError:
             pass
 
+        other_warns = [w for w in warns if not str(w.message).startswith(msg)]
+        joblib_warning = [w for w in warns
+                          if str(w.message).startswith(msg)][0]
+
+        for w in other_warns:
+            warnings.warn(message=w.message, category=w.category)
+
+        message = str(joblib_warning.message) + (
+            " The persisted files are located under: %s" % path)
+        warnings.warn(message=message, category=joblib_warning.category)
+
     return X, y

From f9b34a7b1bbae9ed4f256de16e060059406d1a49 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 27 Jun 2019 15:55:13 +0200
Subject: [PATCH 08/18] raise warnings, when needed

---
 sklearn/datasets/base.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index b4793f79ce7d0..c4f5163a388b1 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -935,21 +935,22 @@ def _refresh_cache(path):
         refresh_needed = any([str(x.message).startswith(msg) for x in warns])
 
     if refresh_needed:
+        raise_joblib = False
         try:
             joblib.dump(X, samples_path, compress=9)
             joblib.dump(y, samples_path, compress=9)
         except IOError:
-            pass
+            raise_joblib = True
 
         other_warns = [w for w in warns if not str(w.message).startswith(msg)]
-        joblib_warning = [w for w in warns
-                          if str(w.message).startswith(msg)][0]
-
         for w in other_warns:
             warnings.warn(message=w.message, category=w.category)
 
-        message = str(joblib_warning.message) + (
-            " The persisted files are located under: %s" % path)
-        warnings.warn(message=message, category=joblib_warning.category)
+        if raise_joblib:
+            joblib_warning = [w for w in warns
+                              if str(w.message).startswith(msg)][0]
+            message = str(joblib_warning.message) + (
+                " The persisted files are located under: %s" % path)
+            warnings.warn(message=message, category=joblib_warning.category)
 
     return X, y

From 5baba644033478972c709204b66d4547bba7103f Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 27 Jun 2019 23:09:51 +0200
Subject: [PATCH 09/18] fix the other fetch_... instances

---
 sklearn/datasets/base.py                  | 18 +++++++++---------
 sklearn/datasets/california_housing.py    |  5 ++++-
 sklearn/datasets/covtype.py               |  2 +-
 sklearn/datasets/kddcup99.py              |  7 +++++--
 sklearn/datasets/olivetti_faces.py        |  5 ++++-
 sklearn/datasets/rcv1.py                  | 13 +++++++++----
 sklearn/datasets/species_distributions.py |  5 ++++-
 7 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index c4f5163a388b1..09bf03925e3ad 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -922,23 +922,20 @@ def _fetch_remote(remote, dirname=None):
     return file_path
 
 
-def _refresh_cache(path):
+def _refresh_cache(files, compress):
     # REMOVE in v0.23
     import joblib
-    samples_path = _pkl_filepath(path, "samples")
-    targets_path = _pkl_filepath(path, "targets")
     msg = "sklearn.externals.joblib is deprecated in 0.21"
     with warnings.catch_warnings(record=True) as warns:
-        X = joblib.load(samples_path)
-        y = joblib.load(targets_path)
+        data = tuple([joblib.load(f) for f in files])
 
-        refresh_needed = any([str(x.message).startswith(msg) for x in warns])
+    refresh_needed = any([str(x.message).startswith(msg) for x in warns])
 
     if refresh_needed:
         raise_joblib = False
         try:
-            joblib.dump(X, samples_path, compress=9)
-            joblib.dump(y, samples_path, compress=9)
+            for value, path in zip(data, files):
+                joblib.dump(value, path, compress=compress)
         except IOError:
             raise_joblib = True
 
@@ -953,4 +950,7 @@ def _refresh_cache(path):
                 " The persisted files are located under: %s" % path)
             warnings.warn(message=message, category=joblib_warning.category)
 
-    return X, y
+    if len(data) == 1:
+        return data[0]
+    else:
+        return data
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
index 35f0847c1de05..1ec001277020c 100644
--- a/sklearn/datasets/california_housing.py
+++ b/sklearn/datasets/california_housing.py
@@ -34,6 +34,7 @@
 from .base import _fetch_remote
 from .base import _pkl_filepath
 from .base import RemoteFileMetadata
+from .base import _refresh_cache
 from ..utils import Bunch
 
 # The original data can be found at:
@@ -129,7 +130,9 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
         remove(archive_path)
 
     else:
-        cal_housing = joblib.load(filepath)
+        cal_housing = _refresh_cache([filepath], 6)
+        # Revert to the following two lines in v0.23
+        # cal_housing = joblib.load(filepath)
 
     feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms",
                      "Population", "AveOccup", "Latitude", "Longitude"]
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index 24a9c8dfc108f..faa521dd03187 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -126,7 +126,7 @@ def fetch_covtype(data_home=None, download_if_missing=True,
     try:
         X, y
     except NameError:
-        X, y = _refresh_cache(covtype_dir)
+        X, y = _refresh_cache([samples_path, targets_path], 9)
         # Revert to the following two lines in v0.23
         # X = joblib.load(samples_path)
         # y = joblib.load(targets_path)
diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
index 837a489e7212c..dd3653dfe0083 100644
--- a/sklearn/datasets/kddcup99.py
+++ b/sklearn/datasets/kddcup99.py
@@ -20,6 +20,7 @@
 from .base import _fetch_remote
 from .base import get_data_home
 from .base import RemoteFileMetadata
+from .base import _refresh_cache
 from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import shuffle as shuffle_method
@@ -292,8 +293,10 @@ def _fetch_brute_kddcup99(data_home=None,
     try:
         X, y
     except NameError:
-        X = joblib.load(samples_path)
-        y = joblib.load(targets_path)
+        X, y = _refresh_cache([samples_path, targets_path], 0)
+        # Revert to the following two lines in v0.23
+        # X = joblib.load(samples_path)
+        # y = joblib.load(targets_path)
 
     return Bunch(data=X, target=y)
 
diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
index a52f90414e104..52503554c485b 100644
--- a/sklearn/datasets/olivetti_faces.py
+++ b/sklearn/datasets/olivetti_faces.py
@@ -24,6 +24,7 @@
 from .base import _fetch_remote
 from .base import RemoteFileMetadata
 from .base import _pkl_filepath
+from .base import _refresh_cache
 from ..utils import check_random_state, Bunch
 
 # The original data can be found at:
@@ -107,7 +108,9 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         joblib.dump(faces, filepath, compress=6)
         del mfile
     else:
-        faces = joblib.load(filepath)
+        faces = _refresh_cache([filepath], 6)
+        # Revert to the following two lines in v0.23
+        # faces = joblib.load(filepath)
 
     # We want floating point data, but float32 is enough (there is only
     # one byte of precision in the original uint8s anyway)
diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
index c95cf1d1be75a..b5f0a47065168 100644
--- a/sklearn/datasets/rcv1.py
+++ b/sklearn/datasets/rcv1.py
@@ -22,6 +22,7 @@
 from .base import _pkl_filepath
 from .base import _fetch_remote
 from .base import RemoteFileMetadata
+from .base import _refresh_cache
 from .svmlight_format import load_svmlight_files
 from ..utils import shuffle as shuffle_
 from ..utils import Bunch
@@ -189,8 +190,10 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
             f.close()
             remove(f.name)
     else:
-        X = joblib.load(samples_path)
-        sample_id = joblib.load(sample_id_path)
+        X, sample_id = _refresh_cache([samples_path, sample_id_path], 9)
+        # Revert to the following two lines in v0.23
+        # X = joblib.load(samples_path)
+        # sample_id = joblib.load(sample_id_path)
 
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
@@ -243,8 +246,10 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         joblib.dump(y, sample_topics_path, compress=9)
         joblib.dump(categories, topics_path, compress=9)
     else:
-        y = joblib.load(sample_topics_path)
-        categories = joblib.load(topics_path)
+        y, categories = _refresh_cache([sample_topics_path, topics_path], 9)
+        # Revert to the following two lines in v0.23
+        # y = joblib.load(sample_topics_path)
+        # categories = joblib.load(topics_path)
 
     if subset == 'all':
         pass
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index f9a04f92b8486..1006bd58c62f0 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -51,6 +51,7 @@
 from .base import RemoteFileMetadata
 from ..utils import Bunch
 from .base import _pkl_filepath
+from .base import _refresh_cache
 
 # The original data can be found at:
 # https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip
@@ -259,6 +260,8 @@ def fetch_species_distributions(data_home=None,
                       **extra_params)
         joblib.dump(bunch, archive_path, compress=9)
     else:
-        bunch = joblib.load(archive_path)
+        bunch = _refresh_cache([archive_path], 9)
+        # Revert to the following two lines in v0.23
+        # bunch = joblib.load(archive_path)
 
     return bunch

From 6fe91a284e8745af2c1d1ab7464169b35b0bd9ca Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Fri, 28 Jun 2019 11:43:30 +0200
Subject: [PATCH 10/18] dataset specific message

---
 sklearn/datasets/base.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index 09bf03925e3ad..10016b24afcca 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -932,24 +932,21 @@ def _refresh_cache(files, compress):
     refresh_needed = any([str(x.message).startswith(msg) for x in warns])
 
     if refresh_needed:
-        raise_joblib = False
         try:
             for value, path in zip(data, files):
                 joblib.dump(value, path, compress=compress)
         except IOError:
-            raise_joblib = True
+            message = ("This dataset will stop being loadable in scikit-learn "
+                       "version 0.23 because it references a deprecated "
+                       "import path. Consider removing the following files "
+                       "and allowing it to be cached anew:\n%s"
+                       % ("\n".join(files)))
+            warnings.warn(message=message, category=DeprecationWarning)
 
         other_warns = [w for w in warns if not str(w.message).startswith(msg)]
         for w in other_warns:
             warnings.warn(message=w.message, category=w.category)
 
-        if raise_joblib:
-            joblib_warning = [w for w in warns
-                              if str(w.message).startswith(msg)][0]
-            message = str(joblib_warning.message) + (
-                " The persisted files are located under: %s" % path)
-            warnings.warn(message=message, category=joblib_warning.category)
-
     if len(data) == 1:
         return data[0]
     else:

From aac769ea2865e5f9a1429b7df41c17c0d7e7d80c Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sun, 30 Jun 2019 14:12:02 +0200
Subject: [PATCH 11/18] add tests, always raise unrelated warnings

---
 sklearn/datasets/base.py            |  8 +++---
 sklearn/datasets/tests/test_base.py | 42 +++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index 10016b24afcca..25a16ea1bad88 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -931,6 +931,10 @@ def _refresh_cache(files, compress):
 
     refresh_needed = any([str(x.message).startswith(msg) for x in warns])
 
+    other_warns = [w for w in warns if not str(w.message).startswith(msg)]
+    for w in other_warns:
+        warnings.warn(message=w.message, category=w.category)
+
     if refresh_needed:
         try:
             for value, path in zip(data, files):
@@ -943,10 +947,6 @@ def _refresh_cache(files, compress):
                        % ("\n".join(files)))
             warnings.warn(message=message, category=DeprecationWarning)
 
-        other_warns = [w for w in warns if not str(w.message).startswith(msg)]
-        for w in other_warns:
-            warnings.warn(message=w.message, category=w.category)
-
     if len(data) == 1:
         return data[0]
     else:
diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index 676cb00fd16f8..89171143bf579 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -8,6 +8,7 @@
 from functools import partial
 
 import pytest
+import joblib
 
 import numpy as np
 from sklearn.datasets import get_data_home
@@ -23,6 +24,7 @@
 from sklearn.datasets import load_boston
 from sklearn.datasets import load_wine
 from sklearn.datasets.base import Bunch
+from sklearn.datasets.base import _refresh_cache
 from sklearn.datasets.tests.test_common import check_return_X_y
 
 from sklearn.externals._pilutil import pillow_installed
@@ -277,3 +279,43 @@ def test_bunch_dir():
     # check that dir (important for autocomplete) shows attributes
     data = load_iris()
     assert "data" in dir(data)
+
+
+def test_refresh_cache(monkeypatch):
+    def _load_warn(*args, **kwargs):
+        msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be "
+               "removed in 0.23. Please import this functionality directly "
+               "from joblib, which can be installed with: pip install joblib. "
+               "If this warning is raised when loading pickled models, you "
+               "may need to re-serialize those models with scikit-learn "
+               "0.21+.")
+        warnings.warn(msg, DeprecationWarning)
+        return 0
+
+    def _load_warn_unrelated(*args, **kwargs):
+        warnings.warn("unrelated warning", UserWarning)
+        return 0
+
+    def _dump_safe(*args, **kwargs):
+        pass
+
+    def _dump_raise(*args, **kwargs):
+        raise IOError()
+
+    monkeypatch.setattr(joblib, "load", _load_warn)
+    monkeypatch.setattr(joblib, "dump", _dump_raise)
+    msg = "This dataset will stop being loadable in scikit-learn"
+    with pytest.warns(DeprecationWarning, match=msg):
+        _refresh_cache('test', 0)
+
+    monkeypatch.setattr(joblib, "load", _load_warn)
+    monkeypatch.setattr(joblib, "dump", _dump_safe)
+    with warnings.catch_warnings(record=True) as warns:
+        _refresh_cache('test', 0)
+
+    assert len(warns) == 0
+
+    monkeypatch.setattr(joblib, "load", _load_warn_unrelated)
+    monkeypatch.setattr(joblib, "dump", _dump_safe)
+    with pytest.warns(UserWarning, match="unrelated warning"):
+        _refresh_cache('test', 0)

From 8a772eb9edc679ea91ced37f846582e46998a2d5 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sun, 30 Jun 2019 14:18:53 +0200
Subject: [PATCH 12/18] add test comments

---
 sklearn/datasets/tests/test_base.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index 89171143bf579..e9c0beb654da7 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -283,6 +283,8 @@ def test_bunch_dir():
 
 def test_refresh_cache(monkeypatch):
     def _load_warn(*args, **kwargs):
+        # raise the warning from "externals.joblib.__init__.py"
+        # this is raised when a file persisted by the old joblib is loaded now
         msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be "
                "removed in 0.23. Please import this functionality directly "
                "from joblib, which can be installed with: pip install joblib. "
@@ -300,21 +302,28 @@ def _dump_safe(*args, **kwargs):
         pass
 
     def _dump_raise(*args, **kwargs):
+        # this happens if the file is read-only and joblib.dump fails to write
+        # on it.
         raise IOError()
 
+    # test if the dataset specific warning is raised if load raises the joblib
+    # warning, and dump fails to dump with new joblib
     monkeypatch.setattr(joblib, "load", _load_warn)
     monkeypatch.setattr(joblib, "dump", _dump_raise)
     msg = "This dataset will stop being loadable in scikit-learn"
     with pytest.warns(DeprecationWarning, match=msg):
         _refresh_cache('test', 0)
 
+    # make sure no warning is raised if load raises the warning, but dump
+    # manages to dump the new data
     monkeypatch.setattr(joblib, "load", _load_warn)
     monkeypatch.setattr(joblib, "dump", _dump_safe)
     with warnings.catch_warnings(record=True) as warns:
         _refresh_cache('test', 0)
-
     assert len(warns) == 0
 
+    # test if an unrelated warning is still passed through and not suppressed
+    # by _refresh_cache
     monkeypatch.setattr(joblib, "load", _load_warn_unrelated)
     monkeypatch.setattr(joblib, "dump", _dump_safe)
     with pytest.warns(UserWarning, match="unrelated warning"):

From 2adbfd75970388224bbb560382ee76d8a6bca091 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sun, 30 Jun 2019 14:24:29 +0200
Subject: [PATCH 13/18] add pytest monkeypatch link

---
 sklearn/datasets/tests/test_base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index e9c0beb654da7..ba3deec4c6d3d 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -282,6 +282,9 @@ def test_bunch_dir():
 
 
 def test_refresh_cache(monkeypatch):
+    # uses pytest's monkeypatch fixture
+    # https://docs.pytest.org/en/latest/monkeypatch.html
+
     def _load_warn(*args, **kwargs):
         # raise the warning from "externals.joblib.__init__.py"
         # this is raised when a file persisted by the old joblib is loaded now

From 532d57f38e6d9f23c798eb571ab9d01e3837e091 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sun, 30 Jun 2019 15:22:55 +0200
Subject: [PATCH 14/18] use pytest.warns to record warnings

---
 sklearn/datasets/tests/test_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index ba3deec4c6d3d..5b77cbda30d1d 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -321,7 +321,7 @@ def _dump_raise(*args, **kwargs):
     # manages to dump the new data
     monkeypatch.setattr(joblib, "load", _load_warn)
     monkeypatch.setattr(joblib, "dump", _dump_safe)
-    with warnings.catch_warnings(record=True) as warns:
+    with pytest.warns(None) as warns:
         _refresh_cache('test', 0)
     assert len(warns) == 0
 

From 98fc6decf3206d55360735a808e3693ed1a4bf4a Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sun, 30 Jun 2019 18:30:32 +0200
Subject: [PATCH 15/18] UserWarning -> DeprecationWarning

---
 sklearn/datasets/tests/test_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index 5b77cbda30d1d..b90894b5430c3 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -298,7 +298,7 @@ def _load_warn(*args, **kwargs):
         return 0
 
     def _load_warn_unrelated(*args, **kwargs):
-        warnings.warn("unrelated warning", UserWarning)
+        warnings.warn("unrelated warning", DeprecationWarning)
         return 0
 
     def _dump_safe(*args, **kwargs):
@@ -329,5 +329,5 @@ def _dump_raise(*args, **kwargs):
     # by _refresh_cache
     monkeypatch.setattr(joblib, "load", _load_warn_unrelated)
     monkeypatch.setattr(joblib, "dump", _dump_safe)
-    with pytest.warns(UserWarning, match="unrelated warning"):
+    with pytest.warns(DeprecationWarning, match="unrelated warning"):
         _refresh_cache('test', 0)

From a06a2c890cc180aed223cc82f8774c10fbfb89b3 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Fri, 5 Jul 2019 12:21:18 +0200
Subject: [PATCH 16/18] apply comments

---
 doc/whats_new/v0.21.rst                   | 10 ++++++++++
 sklearn/datasets/base.py                  |  7 ++-----
 sklearn/datasets/california_housing.py    |  2 +-
 sklearn/datasets/covtype.py               |  2 +-
 sklearn/datasets/kddcup99.py              |  2 +-
 sklearn/datasets/olivetti_faces.py        |  2 +-
 sklearn/datasets/rcv1.py                  |  4 ++--
 sklearn/datasets/species_distributions.py |  2 +-
 8 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 2e1c639e267b7..78c92ebf97534 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -12,6 +12,16 @@ Version 0.21.3
 Changelog
 ---------
 
+:mod:`sklearn.datasets`
+.......................
+
+- |Fix| :func:`fetch_california_housing`, :func:`fetch_covtype`,
+  :func:`fetch_kddcup99`, :func:`fetch_olivetti_faces`,
+  :func:`fetch_rcv1`, and :func:`fetch_species_distributions` try to persist
+  the previously cache using the new ``joblib`` if the cahce was persisted
+  using the deprecated ``sklearn.externals.joblib``. This behavior is set to
+  be deprecated and removed in v0.23. :pr:`14197` by `Adrin Jalali`_.
+
 :mod:`sklearn.impute`
 .....................
 
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index 25a16ea1bad88..c353746c1c326 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -923,7 +923,7 @@ def _fetch_remote(remote, dirname=None):
 
 
 def _refresh_cache(files, compress):
-    # REMOVE in v0.23
+    # TODO: REMOVE in v0.23
     import joblib
     msg = "sklearn.externals.joblib is deprecated in 0.21"
     with warnings.catch_warnings(record=True) as warns:
@@ -947,7 +947,4 @@ def _refresh_cache(files, compress):
                        % ("\n".join(files)))
             warnings.warn(message=message, category=DeprecationWarning)
 
-    if len(data) == 1:
-        return data[0]
-    else:
-        return data
+    return data[0] if len(data) == 1 else data
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
index 1ec001277020c..7d8b1aa3ede45 100644
--- a/sklearn/datasets/california_housing.py
+++ b/sklearn/datasets/california_housing.py
@@ -131,7 +131,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
 
     else:
         cal_housing = _refresh_cache([filepath], 6)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following line in v0.23
         # cal_housing = joblib.load(filepath)
 
     feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms",
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index faa521dd03187..4108b1d79f84b 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -127,7 +127,7 @@ def fetch_covtype(data_home=None, download_if_missing=True,
         X, y
     except NameError:
         X, y = _refresh_cache([samples_path, targets_path], 9)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following two lines in v0.23
         # X = joblib.load(samples_path)
         # y = joblib.load(targets_path)
 
diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
index dd3653dfe0083..f50f49f85ab6f 100644
--- a/sklearn/datasets/kddcup99.py
+++ b/sklearn/datasets/kddcup99.py
@@ -294,7 +294,7 @@ def _fetch_brute_kddcup99(data_home=None,
         X, y
     except NameError:
         X, y = _refresh_cache([samples_path, targets_path], 0)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following two lines in v0.23
         # X = joblib.load(samples_path)
         # y = joblib.load(targets_path)
 
diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
index 52503554c485b..24eeb7927abcf 100644
--- a/sklearn/datasets/olivetti_faces.py
+++ b/sklearn/datasets/olivetti_faces.py
@@ -109,7 +109,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         del mfile
     else:
         faces = _refresh_cache([filepath], 6)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following line in v0.23
         # faces = joblib.load(filepath)
 
     # We want floating point data, but float32 is enough (there is only
diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
index b5f0a47065168..c000acf13e249 100644
--- a/sklearn/datasets/rcv1.py
+++ b/sklearn/datasets/rcv1.py
@@ -191,7 +191,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
             remove(f.name)
     else:
         X, sample_id = _refresh_cache([samples_path, sample_id_path], 9)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following two lines in v0.23
         # X = joblib.load(samples_path)
         # sample_id = joblib.load(sample_id_path)
 
@@ -247,7 +247,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         joblib.dump(categories, topics_path, compress=9)
     else:
         y, categories = _refresh_cache([sample_topics_path, topics_path], 9)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following two lines in v0.23
         # y = joblib.load(sample_topics_path)
         # categories = joblib.load(topics_path)
 
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index 1006bd58c62f0..82ae22129ab9b 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -261,7 +261,7 @@ def fetch_species_distributions(data_home=None,
         joblib.dump(bunch, archive_path, compress=9)
     else:
         bunch = _refresh_cache([archive_path], 9)
-        # Revert to the following two lines in v0.23
+        # TODO: Revert to the following line in v0.23
         # bunch = joblib.load(archive_path)
 
     return bunch

From 5713a236e9e3b0312cf3e35f458c82fe07750dd5 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 8 Jul 2019 14:07:13 +0200
Subject: [PATCH 17/18] cache -> cached data

---
 doc/whats_new/v0.21.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 78c92ebf97534..74d0c88194347 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -18,7 +18,7 @@ Changelog
 - |Fix| :func:`fetch_california_housing`, :func:`fetch_covtype`,
   :func:`fetch_kddcup99`, :func:`fetch_olivetti_faces`,
   :func:`fetch_rcv1`, and :func:`fetch_species_distributions` try to persist
-  the previously cache using the new ``joblib`` if the cahce was persisted
+  the previously cache using the new ``joblib`` if the cahced data was persisted
   using the deprecated ``sklearn.externals.joblib``. This behavior is set to
   be deprecated and removed in v0.23. :pr:`14197` by `Adrin Jalali`_.
 

From 6cf82540cd35f34f320076263d41c959b4f1f91d Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 9 Jul 2019 10:11:42 +0200
Subject: [PATCH 18/18] fixing references in whats_new

---
 doc/whats_new/v0.21.rst | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 74d0c88194347..cf3302ad62c00 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -15,12 +15,14 @@ Changelog
 :mod:`sklearn.datasets`
 .......................
 
-- |Fix| :func:`fetch_california_housing`, :func:`fetch_covtype`,
-  :func:`fetch_kddcup99`, :func:`fetch_olivetti_faces`,
-  :func:`fetch_rcv1`, and :func:`fetch_species_distributions` try to persist
-  the previously cache using the new ``joblib`` if the cahced data was persisted
-  using the deprecated ``sklearn.externals.joblib``. This behavior is set to
-  be deprecated and removed in v0.23. :pr:`14197` by `Adrin Jalali`_.
+- |Fix| :func:`datasets.fetch_california_housing`,
+  :func:`datasets.fetch_covtype`,
+  :func:`datasets.fetch_kddcup99`, :func:`datasets.fetch_olivetti_faces`,
+  :func:`datasets.fetch_rcv1`, and :func:`datasets.fetch_species_distributions`
+  try to persist the previously cached data using the new ``joblib`` if the
+  cached data was persisted using the deprecated ``sklearn.externals.joblib``.
+  This behavior is set to be deprecated and removed in v0.23.
+  :pr:`14197` by `Adrin Jalali`_.
 
 :mod:`sklearn.impute`
 .....................