
[MRG+1] ENH: dataset fetching using figshare and checksum #9240


Merged — 69 commits, Aug 3, 2017

Changes from all commits (69 commits):
773f0c5
add 20newsgroups dataset to figshare
nelson-liu Sep 14, 2016
a61c20f
made link less verbose
nelson-liu Sep 14, 2016
9e64651
add olivetti to figshare
nelson-liu Sep 14, 2016
b4866e6
add lfw to figshare
nelson-liu Sep 14, 2016
7068152
add california housing dataset to figshare
nelson-liu Sep 15, 2016
2082655
add covtype dataset to figshare
nelson-liu Sep 15, 2016
ff83bd1
add kddcup99 dataset to figshare
nelson-liu Sep 15, 2016
59eae87
add species distribution dataset to figshare
nelson-liu Sep 15, 2016
f33a52c
add rcv1 dataset
nelson-liu Sep 15, 2016
dfe24f9
remove extraneous parens from url strings
nelson-liu Oct 27, 2016
7186af8
check md5 of datasets and add resume functionality to downloads
nelson-liu Dec 24, 2016
4dc8946
remove extraneous print statements
nelson-liu Dec 24, 2016
7260f73
fix flake8 violations
nelson-liu Dec 24, 2016
f2c44ee
add docstrings to new dataset fetching functions
nelson-liu Dec 24, 2016
f6e6ce7
consolidate imports in base and use md5 check function in dl
nelson-liu Dec 25, 2016
983544e
remove accidentally removed import
nelson-liu Dec 25, 2016
03f7f82
attempt to fix docstring conventions / handle case where range header…
nelson-liu Dec 25, 2016
9d39dd0
change functions to used renamed, privatized utilities
nelson-liu Dec 25, 2016
5eadb3a
fix flake8 indentation error
nelson-liu Dec 25, 2016
79a0325
remove checks for joblib dumped files
nelson-liu Dec 27, 2016
29deaa5
fix error in lfw
nelson-liu Dec 27, 2016
269d028
Merge branch 'master' into use_figshare_in_datasets
nelson-liu Apr 27, 2017
773aa48
Add missing Bunch import in california housing
nelson-liu Apr 27, 2017
11c15db
Remove hash validation of 20news output pkl
nelson-liu Apr 28, 2017
f367815
Remove unused import
nelson-liu Apr 28, 2017
1637adb
Rebase 'master' into use_figshare_in_datasets
Jun 28, 2017
d11bc7a
address missing comments in #7429 to start the PR fresh
Jun 29, 2017
ef89676
update _fetch_and_verify_dataset function
Jun 29, 2017
7cf9422
update URL10
Jun 29, 2017
d604d49
Use strerr compatible with python2
Jul 4, 2017
7309779
Use warnings instead of StdErr (suggested by @lesteve)
Jul 4, 2017
0f7e66c
Fix pep8
Jul 4, 2017
0a9ca7d
Replace MD5 by SHA256
Jul 4, 2017
083acda
Fix cal_housing fetcher for the case of having the data locally
Jul 4, 2017
f48a919
Merge branch 'master' into use_figshare_in_datasets
Jul 10, 2017
38a4c02
Revert removing file when checksum fails
Jul 10, 2017
c9db0f3
Keep covertype's original URL as a comment
Jul 10, 2017
f991b2b
Rework the docstrings
Jul 10, 2017
fa1559f
Remove partial download
Jul 10, 2017
b8d8d5a
Add download compatibility with python 2.x
Jul 12, 2017
949d998
Add comment to clarify the usage passing a zipfile to np.load
Jul 13, 2017
7efa606
Fix typo
Jul 19, 2017
fead360
simplify some docstrings and functions
Jul 19, 2017
e7db2d8
Removed wired dictionaries to store remote metadata for lfw dataset
Jul 19, 2017
6601cbd
fixup! fix flake8 violations
Jul 19, 2017
2ffcfc1
Fix rcv1 and rename path to filename
lesteve Jul 19, 2017
02f5a7d
Cosmit
lesteve Jul 20, 2017
f54eabd
Add lfw missing checksum
Jul 20, 2017
3c210c2
Unify fetchers to use RemoteMetaData
Jul 20, 2017
a897f9f
revert logger info in favor of warning
Jul 21, 2017
88d7f61
Add original urls as comments and tides up PY3_OR_LATER
Jul 24, 2017
22130a9
use urlretrieve from six
Jul 24, 2017
d4f9456
remove fetch_url
Jul 24, 2017
38ba738
Rename _fetch_remote path parameter into dirname
lesteve Jul 25, 2017
5dfdafb
Use variable to remove repeated code
lesteve Jul 25, 2017
1286364
Return file_path from _fetch_remote
lesteve Jul 25, 2017
240bfe5
Remove blank lines after comments
lesteve Jul 25, 2017
60b1153
List all links
lesteve Jul 25, 2017
d1250a8
Fix lfw
lesteve Jul 25, 2017
580b131
Tweak comment
lesteve Jul 25, 2017
7295474
Use returned value for _fetch_remote
lesteve Jul 25, 2017
076efb1
Rename variable
lesteve Jul 25, 2017
7fc6627
Minor changes
lesteve Jul 25, 2017
de80947
checksum fix
lesteve Jul 25, 2017
ba862fb
Remove unused imports
lesteve Jul 25, 2017
7a5b9b6
Comment minor tweak
lesteve Jul 25, 2017
29a0301
Convert list of remotes into tuple of remotes to ensure immutability
Aug 1, 2017
bf869a6
Move from print statements to logging
Aug 1, 2017
6daa256
Configure root logger in sklearn/__init__.py
lesteve Aug 2, 2017
5 changes: 5 additions & 0 deletions sklearn/__init__.py
@@ -17,6 +17,11 @@
import warnings
import os
from contextlib import contextmanager as _contextmanager
import logging

logger = logging.getLogger(__name__)
Review comment (Member): why not "sklearn" instead of __name__?

Reply (@lesteve, Member, Aug 2, 2017): I guess this is just the general convention, right?

I found this in the Python docs:

    A good convention to use when naming loggers is to use a module-level logger, in each module which uses logging, named as follows:

    logger = logging.getLogger(__name__)

and this in the Hitchhiker's Guide to Python:

    Best practice when instantiating loggers in a library is to only create them using the __name__ global variable: the logging module creates a hierarchy of loggers using dot notation, so using __name__ ensures no name collisions.

logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

_ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False))

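As an aside on the review thread above — a minimal sketch (not part of the diff) of why getLogger(__name__) works here: in sklearn/__init__.py, __name__ is "sklearn", and the logging module arranges loggers in a dot-separated hierarchy, so records from any sklearn.* module logger propagate up to the handler configured above.

    import logging

    # Equivalent to what sklearn/__init__.py does, since __name__ == "sklearn" there.
    sklearn_logger = logging.getLogger("sklearn")
    sklearn_logger.addHandler(logging.StreamHandler())
    sklearn_logger.setLevel(logging.INFO)

    # A module-level logger, e.g. __name__ == "sklearn.datasets.california_housing",
    # propagates its records up the dotted hierarchy to "sklearn", whose
    # StreamHandler prints them.
    module_logger = logging.getLogger("sklearn.datasets.california_housing")
    module_logger.info("Downloading ...")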
169 changes: 107 additions & 62 deletions sklearn/datasets/base.py
@@ -6,39 +6,40 @@
# 2010 Fabian Pedregosa <[email protected]>
# 2010 Olivier Grisel <[email protected]>
# License: BSD 3 clause
from __future__ import print_function
Review comment (Member): Is there any print statement?

Reply (Contributor Author): It is used in doctests, lines 509 and 674.


import os
import csv
import sys
import shutil
from os import environ
from os.path import dirname
from os.path import join
from os.path import exists
from os.path import expanduser
from os.path import isdir
from os.path import splitext
from os import listdir
from os import makedirs
from collections import namedtuple
from os import environ, listdir, makedirs
from os.path import dirname, exists, expanduser, isdir, join, splitext
import hashlib

from ..utils import Bunch
from ..utils import check_random_state

import numpy as np

from ..utils import check_random_state
from sklearn.externals.six.moves.urllib.request import urlretrieve

RemoteFileMetadata = namedtuple('RemoteFileMetadata',
['filename', 'url', 'checksum'])


def get_data_home(data_home=None):
"""Return the path of the scikit-learn data dir.

This folder is used by some large dataset loaders to avoid
downloading the data several times.
This folder is used by some large dataset loaders to avoid downloading the
data several times.

By default the data dir is set to a folder named 'scikit_learn_data'
in the user home folder.
By default the data dir is set to a folder named 'scikit_learn_data' in the
user home folder.

Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment
variable or programmatically by giving an explicit folder path. The
'~' symbol is expanded to the user home folder.
variable or programmatically by giving an explicit folder path. The '~'
symbol is expanded to the user home folder.

If the folder does not already exist, it is automatically created.
"""
@@ -76,23 +77,22 @@ def load_files(container_path, description=None, categories=None,
file_44.txt
...

The folder names are used as supervised signal label names. The
individual file names are not important.
The folder names are used as supervised signal label names. The individual
file names are not important.

This function does not try to extract features into a numpy array or
scipy sparse matrix. In addition, if load_content is false it
does not try to load the files in memory.
This function does not try to extract features into a numpy array or scipy
sparse matrix. In addition, if load_content is false it does not try to
load the files in memory.

To use text files in a scikit-learn classification or clustering
algorithm, you will need to use the `sklearn.feature_extraction.text`
module to build a feature extraction transformer that suits your
problem.
To use text files in a scikit-learn classification or clustering algorithm,
you will need to use the `sklearn.feature_extraction.text` module to build
a feature extraction transformer that suits your problem.

If you set load_content=True, you should also specify the encoding of
the text using the 'encoding' parameter. For many modern text files,
'utf-8' will be the correct encoding. If you leave encoding equal to None,
then the content will be made of bytes instead of Unicode, and you will
not be able to use most functions in `sklearn.feature_extraction.text`.
If you set load_content=True, you should also specify the encoding of the
text using the 'encoding' parameter. For many modern text files, 'utf-8'
will be the correct encoding. If you leave encoding equal to None, then the
content will be made of bytes instead of Unicode, and you will not be able
to use most functions in `sklearn.feature_extraction.text`.

Similar feature extractors should be built for other kind of unstructured
data input such as images, audio, video, ...
@@ -109,20 +109,19 @@ def load_files(container_path, description=None, categories=None,
reference, etc.

categories : A collection of strings or None, optional (default=None)
If None (default), load all the categories.
If not None, list of category names to load (other categories ignored).
If None (default), load all the categories. If not None, list of
category names to load (other categories ignored).

load_content : boolean, optional (default=True)
Whether to load or not the content of the different files. If
true a 'data' attribute containing the text information is present
in the data structure returned. If not, a filenames attribute
gives the path to the files.
Whether to load or not the content of the different files. If true a
'data' attribute containing the text information is present in the data
structure returned. If not, a filenames attribute gives the path to the
files.

encoding : string or None (default is None)
If None, do not try to decode the content of the files (e.g. for
images or other non-text content).
If not None, encoding to use to decode text files to Unicode if
load_content is True.
If None, do not try to decode the content of the files (e.g. for images
or other non-text content). If not None, encoding to use to decode text
files to Unicode if load_content is True.

decode_error : {'strict', 'ignore', 'replace'}, optional
Instruction on what to do if a byte sequence is given to analyze that
@@ -262,16 +261,15 @@ def load_wine(return_X_y=False):
Returns
-------
data : Bunch
Dictionary-like object, the interesting attributes are:
'data', the data to learn, 'target', the classification labels,
'target_names', the meaning of the labels, 'feature_names', the
meaning of the features, and 'DESCR', the
full description of the dataset.
Dictionary-like object, the interesting attributes are: 'data', the
data to learn, 'target', the classification labels, 'target_names', the
meaning of the labels, 'feature_names', the meaning of the features,
and 'DESCR', the full description of the dataset.

(data, target) : tuple if ``return_X_y`` is True

The copy of UCI ML Wine Data Set dataset is
downloaded and modified to fit standard format from:
The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit
standard format from:
https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

Examples
@@ -332,8 +330,8 @@ def load_iris(return_X_y=False):
Parameters
----------
return_X_y : boolean, default=False.
If True, returns ``(data, target)`` instead of a Bunch object.
See below for more information about the `data` and `target` object.
If True, returns ``(data, target)`` instead of a Bunch object. See
below for more information about the `data` and `target` object.

.. versionadded:: 0.18

@@ -709,15 +707,15 @@ def load_boston(return_X_y=False):

def load_sample_images():
"""Load sample images for image manipulation.

Loads both, ``china`` and ``flower``.

Returns
-------
data : Bunch
Dictionary-like object with the following attributes :
'images', the two sample images, 'filenames', the file
names for the images, and 'DESCR'
the full description of the dataset.
Dictionary-like object with the following attributes : 'images', the
two sample images, 'filenames', the file names for the images, and
'DESCR' the full description of the dataset.

Examples
--------
@@ -799,18 +797,18 @@ def load_sample_image(image_name):
def _pkl_filepath(*args, **kwargs):
"""Ensure different filenames for Python 2 and Python 3 pickles

An object pickled under Python 3 cannot be loaded under Python 2.
An object pickled under Python 2 can sometimes not be loaded
correctly under Python 3 because some Python 2 strings are decoded as
Python 3 strings which can be problematic for objects that use Python 2
strings as byte buffers for numerical data instead of "real" strings.
An object pickled under Python 3 cannot be loaded under Python 2. An object
pickled under Python 2 can sometimes not be loaded correctly under Python 3
because some Python 2 strings are decoded as Python 3 strings which can be
problematic for objects that use Python 2 strings as byte buffers for
numerical data instead of "real" strings.

Therefore, dataset loaders in scikit-learn use different files for pickles
managed by Python 2 and Python 3 in the same SCIKIT_LEARN_DATA folder so
as to avoid conflicts.
managed by Python 2 and Python 3 in the same SCIKIT_LEARN_DATA folder so as
to avoid conflicts.

args[-1] is expected to be the ".pkl" filename. Under Python 3, a
suffix is inserted before the extension to separate Python 2 and Python 3
pickles.
args[-1] is expected to be the ".pkl" filename. Under Python 3, a suffix is
inserted before the extension to separate Python 2 and Python 3 pickles.

_pkl_filepath('/path/to/folder', 'filename.pkl') returns:
- /path/to/folder/filename.pkl under Python 2
@@ -823,3 +821,50 @@ def _pkl_filepath(*args, **kwargs):
basename += py3_suffix
new_args = args[:-1] + (basename + ext,)
return join(*new_args)


def _sha256(path):
"""Calculate the sha256 hash of the file at path."""
sha256hash = hashlib.sha256()
chunk_size = 8192
with open(path, "rb") as f:
while True:
buffer = f.read(chunk_size)
if not buffer:
break
sha256hash.update(buffer)
return sha256hash.hexdigest()


def _fetch_remote(remote, dirname=None):
"""Helper function to download a remote dataset into path

Fetch a dataset pointed by remote's url, save into path using remote's
filename and ensure its integrity based on the SHA256 Checksum of the
downloaded file.

Parameters
-----------
remote : RemoteFileMetadata
Named tuple containing remote dataset meta information: url, filename
and checksum

dirname : string
Directory to save the file to.

Returns
-------
file_path : string
Full path of the created file.
"""

file_path = (remote.filename if dirname is None
else join(dirname, remote.filename))
urlretrieve(remote.url, file_path)
checksum = _sha256(file_path)
if remote.checksum != checksum:
raise IOError("{} has an SHA256 checksum ({}) "
"differing from expected ({}), "
"file may be corrupted.".format(file_path, checksum,
remote.checksum))
return file_path
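Putting the two helpers together — a sketch (not code from the PR) of how a dataset fetcher is expected to call the private _fetch_remote with a RemoteFileMetadata entry; the figshare URL and checksum are the California-housing values declared in the diff below:

    from sklearn.datasets.base import (RemoteFileMetadata, _fetch_remote,
                                       get_data_home)

    archive = RemoteFileMetadata(
        filename='cal_housing.tgz',
        url='https://ndownloader.figshare.com/files/5976036',
        checksum=('aaa5c9a6afe2225cc2aed2723682ae40'
                  '3280c4a3695a2ddda4ffb5d8215ea681'))

    # Downloads into the scikit-learn data home; _fetch_remote raises IOError
    # if the SHA256 of the downloaded file differs from the expected checksum.
    archive_path = _fetch_remote(archive, dirname=get_data_home())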
37 changes: 20 additions & 17 deletions sklearn/datasets/california_housing.py
@@ -21,33 +21,33 @@
# Authors: Peter Prettenhofer
# License: BSD 3 clause

from io import BytesIO
from os.path import exists
from os import makedirs
from os import makedirs, remove
import tarfile

try:
# Python 2
from urllib2 import urlopen
except ImportError:
# Python 3+
from urllib.request import urlopen

import numpy as np
import logging

from .base import get_data_home
from ..utils import Bunch
from .base import _fetch_remote
from .base import _pkl_filepath
from .base import RemoteFileMetadata
from ..utils import Bunch
from ..externals import joblib


DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
TARGET_FILENAME = "cal_housing.pkz"
# The original data can be found at:
# http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz
ARCHIVE = RemoteFileMetadata(
filename='cal_housing.tgz',
url='https://ndownloader.figshare.com/files/5976036',
checksum=('aaa5c9a6afe2225cc2aed2723682ae40'
'3280c4a3695a2ddda4ffb5d8215ea681'))

# Grab the module-level docstring to use as a description of the
# dataset
MODULE_DOCS = __doc__

logger = logging.getLogger(__name__)

def fetch_california_housing(data_home=None, download_if_missing=True):
"""Loader for the California housing dataset from StatLib.
@@ -89,17 +89,20 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
if not exists(data_home):
makedirs(data_home)

filepath = _pkl_filepath(data_home, TARGET_FILENAME)
filepath = _pkl_filepath(data_home, 'cal_housing.pkz')
if not exists(filepath):
if not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")

print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
archive_fileobj = BytesIO(urlopen(DATA_URL).read())
logger.info('Downloading Cal. housing from {} to {}'.format(
ARCHIVE.url, data_home))
archive_path = _fetch_remote(ARCHIVE, dirname=data_home)

fileobj = tarfile.open(
mode="r:gz",
fileobj=archive_fileobj).extractfile(
name=archive_path).extractfile(
'CaliforniaHousing/cal_housing.data')
remove(archive_path)

cal_housing = np.loadtxt(fileobj, delimiter=',')
# Columns are not in the same order compared to the previous
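End to end, the user-facing behavior after this PR — a sketch assuming the merged API:

    from sklearn.datasets import fetch_california_housing

    # The first call downloads cal_housing.tgz from figshare, verifies its
    # SHA256 checksum, caches a pickle in the data home, and removes the
    # archive; subsequent calls load the cached pickle directly.
    housing = fetch_california_housing()
    print(housing.data.shape)     # (20640, 8)
    print(housing.feature_names)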