diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 3ca2a6814e70b..e74466efd8a95 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -17,6 +17,11 @@ import warnings import os from contextlib import contextmanager as _contextmanager +import logging + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.INFO) _ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 1441daf838032..3d5ceb0a7abff 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -6,39 +6,40 @@ # 2010 Fabian Pedregosa # 2010 Olivier Grisel # License: BSD 3 clause +from __future__ import print_function import os import csv import sys import shutil -from os import environ -from os.path import dirname -from os.path import join -from os.path import exists -from os.path import expanduser -from os.path import isdir -from os.path import splitext -from os import listdir -from os import makedirs +from collections import namedtuple +from os import environ, listdir, makedirs +from os.path import dirname, exists, expanduser, isdir, join, splitext +import hashlib + from ..utils import Bunch +from ..utils import check_random_state import numpy as np -from ..utils import check_random_state +from sklearn.externals.six.moves.urllib.request import urlretrieve + +RemoteFileMetadata = namedtuple('RemoteFileMetadata', + ['filename', 'url', 'checksum']) def get_data_home(data_home=None): """Return the path of the scikit-learn data dir. - This folder is used by some large dataset loaders to avoid - downloading the data several times. + This folder is used by some large dataset loaders to avoid downloading the + data several times. - By default the data dir is set to a folder named 'scikit_learn_data' - in the user home folder. + By default the data dir is set to a folder named 'scikit_learn_data' in the + user home folder. Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment - variable or programmatically by giving an explicit folder path. The - '~' symbol is expanded to the user home folder. + variable or programmatically by giving an explicit folder path. The '~' + symbol is expanded to the user home folder. If the folder does not already exist, it is automatically created. """ @@ -76,23 +77,22 @@ def load_files(container_path, description=None, categories=None, file_44.txt ... - The folder names are used as supervised signal label names. The - individual file names are not important. + The folder names are used as supervised signal label names. The individual + file names are not important. - This function does not try to extract features into a numpy array or - scipy sparse matrix. In addition, if load_content is false it - does not try to load the files in memory. + This function does not try to extract features into a numpy array or scipy + sparse matrix. In addition, if load_content is false it does not try to + load the files in memory. - To use text files in a scikit-learn classification or clustering - algorithm, you will need to use the `sklearn.feature_extraction.text` - module to build a feature extraction transformer that suits your - problem. + To use text files in a scikit-learn classification or clustering algorithm, + you will need to use the `sklearn.feature_extraction.text` module to build + a feature extraction transformer that suits your problem. 
- If you set load_content=True, you should also specify the encoding of - the text using the 'encoding' parameter. For many modern text files, - 'utf-8' will be the correct encoding. If you leave encoding equal to None, - then the content will be made of bytes instead of Unicode, and you will - not be able to use most functions in `sklearn.feature_extraction.text`. + If you set load_content=True, you should also specify the encoding of the + text using the 'encoding' parameter. For many modern text files, 'utf-8' + will be the correct encoding. If you leave encoding equal to None, then the + content will be made of bytes instead of Unicode, and you will not be able + to use most functions in `sklearn.feature_extraction.text`. Similar feature extractors should be built for other kind of unstructured data input such as images, audio, video, ... @@ -109,20 +109,19 @@ def load_files(container_path, description=None, categories=None, reference, etc. categories : A collection of strings or None, optional (default=None) - If None (default), load all the categories. - If not None, list of category names to load (other categories ignored). + If None (default), load all the categories. If not None, list of + category names to load (other categories ignored). load_content : boolean, optional (default=True) - Whether to load or not the content of the different files. If - true a 'data' attribute containing the text information is present - in the data structure returned. If not, a filenames attribute - gives the path to the files. + Whether to load or not the content of the different files. If true a + 'data' attribute containing the text information is present in the data + structure returned. If not, a filenames attribute gives the path to the + files. encoding : string or None (default is None) - If None, do not try to decode the content of the files (e.g. for - images or other non-text content). - If not None, encoding to use to decode text files to Unicode if - load_content is True. + If None, do not try to decode the content of the files (e.g. for images + or other non-text content). If not None, encoding to use to decode text + files to Unicode if load_content is True. decode_error : {'strict', 'ignore', 'replace'}, optional Instruction on what to do if a byte sequence is given to analyze that @@ -262,16 +261,15 @@ def load_wine(return_X_y=False): Returns ------- data : Bunch - Dictionary-like object, the interesting attributes are: - 'data', the data to learn, 'target', the classification labels, - 'target_names', the meaning of the labels, 'feature_names', the - meaning of the features, and 'DESCR', the - full description of the dataset. + Dictionary-like object, the interesting attributes are: 'data', the + data to learn, 'target', the classification labels, 'target_names', the + meaning of the labels, 'feature_names', the meaning of the features, + and 'DESCR', the full description of the dataset. (data, target) : tuple if ``return_X_y`` is True - The copy of UCI ML Wine Data Set dataset is - downloaded and modified to fit standard format from: + The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit + standard format from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data Examples @@ -332,8 +330,8 @@ def load_iris(return_X_y=False): Parameters ---------- return_X_y : boolean, default=False. - If True, returns ``(data, target)`` instead of a Bunch object. - See below for more information about the `data` and `target` object. 
+ If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` object. .. versionadded:: 0.18 @@ -709,15 +707,15 @@ def load_boston(return_X_y=False): def load_sample_images(): """Load sample images for image manipulation. + Loads both, ``china`` and ``flower``. Returns ------- data : Bunch - Dictionary-like object with the following attributes : - 'images', the two sample images, 'filenames', the file - names for the images, and 'DESCR' - the full description of the dataset. + Dictionary-like object with the following attributes : 'images', the + two sample images, 'filenames', the file names for the images, and + 'DESCR' the full description of the dataset. Examples -------- @@ -799,18 +797,18 @@ def load_sample_image(image_name): def _pkl_filepath(*args, **kwargs): """Ensure different filenames for Python 2 and Python 3 pickles - An object pickled under Python 3 cannot be loaded under Python 2. - An object pickled under Python 2 can sometimes not be loaded - correctly under Python 3 because some Python 2 strings are decoded as - Python 3 strings which can be problematic for objects that use Python 2 - strings as byte buffers for numerical data instead of "real" strings. + An object pickled under Python 3 cannot be loaded under Python 2. An object + pickled under Python 2 can sometimes not be loaded correctly under Python 3 + because some Python 2 strings are decoded as Python 3 strings which can be + problematic for objects that use Python 2 strings as byte buffers for + numerical data instead of "real" strings. Therefore, dataset loaders in scikit-learn use different files for pickles - manages by Python 2 and Python 3 in the same SCIKIT_LEARN_DATA folder so - as to avoid conflicts. + manages by Python 2 and Python 3 in the same SCIKIT_LEARN_DATA folder so as + to avoid conflicts. - args[-1] is expected to be the ".pkl" filename. Under Python 3, a - suffix is inserted before the extension to s + args[-1] is expected to be the ".pkl" filename. Under Python 3, a suffix is + inserted before the extension to s _pkl_filepath('/path/to/folder', 'filename.pkl') returns: - /path/to/folder/filename.pkl under Python 2 @@ -823,3 +821,50 @@ def _pkl_filepath(*args, **kwargs): basename += py3_suffix new_args = args[:-1] + (basename + ext,) return join(*new_args) + + +def _sha256(path): + """Calculate the sha256 hash of the file at path.""" + sha256hash = hashlib.sha256() + chunk_size = 8192 + with open(path, "rb") as f: + while True: + buffer = f.read(chunk_size) + if not buffer: + break + sha256hash.update(buffer) + return sha256hash.hexdigest() + + +def _fetch_remote(remote, dirname=None): + """Helper function to download a remote dataset into path + + Fetch a dataset pointed by remote's url, save into path using remote's + filename and ensure its integrity based on the SHA256 Checksum of the + downloaded file. + + Parameters + ----------- + remote : RemoteFileMetadata + Named tuple containing remote dataset meta information: url, filename + and checksum + + dirname : string + Directory to save the file to. + + Returns + ------- + file_path: string + Full path of the created file. 
+ """ + + file_path = (remote.filename if dirname is None + else join(dirname, remote.filename)) + urlretrieve(remote.url, file_path) + checksum = _sha256(file_path) + if remote.checksum != checksum: + raise IOError("{} has an SHA256 checksum ({}) " + "differing from expected ({}), " + "file may be corrupted.".format(file_path, checksum, + remote.checksum)) + return file_path diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index a9f21510b0f01..cc5882ecb9cb9 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -21,33 +21,33 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from io import BytesIO from os.path import exists -from os import makedirs +from os import makedirs, remove import tarfile -try: - # Python 2 - from urllib2 import urlopen -except ImportError: - # Python 3+ - from urllib.request import urlopen - import numpy as np +import logging from .base import get_data_home -from ..utils import Bunch +from .base import _fetch_remote from .base import _pkl_filepath +from .base import RemoteFileMetadata +from ..utils import Bunch from ..externals import joblib - -DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz" -TARGET_FILENAME = "cal_housing.pkz" +# The original data can be found at: +# http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz +ARCHIVE = RemoteFileMetadata( + filename='cal_housing.tgz', + url='https://ndownloader.figshare.com/files/5976036', + checksum=('aaa5c9a6afe2225cc2aed2723682ae40' + '3280c4a3695a2ddda4ffb5d8215ea681')) # Grab the module-level docstring to use as a description of the # dataset MODULE_DOCS = __doc__ +logger = logging.getLogger(__name__) def fetch_california_housing(data_home=None, download_if_missing=True): """Loader for the California housing dataset from StatLib. @@ -89,17 +89,20 @@ def fetch_california_housing(data_home=None, download_if_missing=True): if not exists(data_home): makedirs(data_home) - filepath = _pkl_filepath(data_home, TARGET_FILENAME) + filepath = _pkl_filepath(data_home, 'cal_housing.pkz') if not exists(filepath): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") - print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home)) - archive_fileobj = BytesIO(urlopen(DATA_URL).read()) + logger.info('Downloading Cal. 
housing from {} to {}'.format( + ARCHIVE.url, data_home)) + archive_path = _fetch_remote(ARCHIVE, dirname=data_home) + fileobj = tarfile.open( mode="r:gz", - fileobj=archive_fileobj).extractfile( + name=archive_path).extractfile( 'CaliforniaHousing/cal_housing.data') + remove(archive_path) cal_housing = np.loadtxt(fileobj, delimiter=',') # Columns are not in the same order compared to the previous diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index a529e8579a7c0..c0c8f789975b1 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -15,29 +15,30 @@ # License: BSD 3 clause from gzip import GzipFile -from io import BytesIO import logging from os.path import exists, join -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen +from os import remove import numpy as np from .base import get_data_home +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils import Bunch from .base import _pkl_filepath from ..utils.fixes import makedirs from ..externals import joblib from ..utils import check_random_state +# The original data can be found in: +# http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz +ARCHIVE = RemoteFileMetadata( + filename='covtype.data.gz', + url='https://ndownloader.figshare.com/files/5976039', + checksum=('614360d0257557dd1792834a85a1cdeb' + 'fadc3c4f30b011d56afee7ffb5b15771')) -URL = ('http://archive.ics.uci.edu/ml/' - 'machine-learning-databases/covtype/covtype.data.gz') - - -logger = logging.getLogger() +logger = logging.getLogger(__name__) def fetch_covtype(data_home=None, download_if_missing=True, @@ -91,19 +92,21 @@ def fetch_covtype(data_home=None, download_if_missing=True, if download_if_missing and not available: if not exists(covtype_dir): makedirs(covtype_dir) - logger.warning("Downloading %s" % URL) - f = BytesIO(urlopen(URL).read()) - Xy = np.genfromtxt(GzipFile(fileobj=f), delimiter=',') + logger.info("Downloading %s" % ARCHIVE.url) + + archive_path = _fetch_remote(ARCHIVE, dirname=covtype_dir) + Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=',') + # delete archive + remove(archive_path) X = Xy[:, :-1] y = Xy[:, -1].astype(np.int32) joblib.dump(X, samples_path, compress=9) joblib.dump(y, targets_path, compress=9) - elif not available: - if not download_if_missing: - raise IOError("Data not found and `download_if_missing` is False") + elif not available and not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py index 6d52c5b6214b2..66cb58f3d9aea 100644 --- a/sklearn/datasets/kddcup99.py +++ b/sklearn/datasets/kddcup99.py @@ -11,32 +11,38 @@ import sys import errno from gzip import GzipFile -from io import BytesIO import logging import os from os.path import exists, join -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen import numpy as np + +from .base import _fetch_remote from .base import get_data_home +from .base import RemoteFileMetadata from ..utils import Bunch from ..externals import joblib, six from ..utils import check_random_state from ..utils import shuffle as shuffle_method +# The original data can be found at: +# http://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz +ARCHIVE = RemoteFileMetadata( + filename='kddcup99_data', + url='https://ndownloader.figshare.com/files/5976045', + 
checksum=('3b6c942aa0356c0ca35b7b595a26c89d' + '343652c9db428893e7494f837b274292')) -URL10 = ('http://archive.ics.uci.edu/ml/' - 'machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz') - -URL = ('http://archive.ics.uci.edu/ml/' - 'machine-learning-databases/kddcup99-mld/kddcup.data.gz') +# The original data can be found at: +# http://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz +ARCHIVE_10_PERCENT = RemoteFileMetadata( + filename='kddcup99_10_data', + url='https://ndownloader.figshare.com/files/5976042', + checksum=('8045aca0d84e70e622d1148d7df78249' + '6f6333bf6eb979a1b0837c42a9fd9561')) - -logger = logging.getLogger() +logger = logging.getLogger(__name__) def fetch_kddcup99(subset=None, data_home=None, shuffle=False, @@ -273,20 +279,22 @@ def _fetch_brute_kddcup99(data_home=None, else: # Backward compat for Python 2 users dir_suffix = "" + if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) + archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) + archive = ARCHIVE + samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) - URL_ = URL10 if percent10 else URL - logger.warning("Downloading %s" % URL_) - f = BytesIO(urlopen(URL_).read()) - + logger.info("Downloading %s" % archive.url) + _fetch_remote(archive, dirname=kddcup_dir) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), @@ -330,15 +338,18 @@ def _fetch_brute_kddcup99(data_home=None, ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) - - file_ = GzipFile(fileobj=f, mode='r') + logger.debug("extracting archive") + archive_path = join(kddcup_dir, archive.filename) + file_ = GzipFile(filename=archive_path, mode='r') Xy = [] for line in file_.readlines(): if six.PY3: line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() - print('extraction done') + logger.debug('extraction done') + os.remove(archive_path) + Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) diff --git a/sklearn/datasets/lfw.py b/sklearn/datasets/lfw.py index 50834f7705ef6..0d5f56f189b45 100644 --- a/sklearn/datasets/lfw.py +++ b/sklearn/datasets/lfw.py @@ -23,18 +23,13 @@ # Copyright (c) 2011 Olivier Grisel # License: BSD 3 clause -from os import listdir, makedirs, remove, rename +from os import listdir, makedirs, remove from os.path import join, exists, isdir import logging import numpy as np -try: - import urllib.request as urllib # for backwards compatibility -except ImportError: - import urllib - -from .base import get_data_home +from .base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch from ..externals.joblib import Memory @@ -42,15 +37,45 @@ logger = logging.getLogger(__name__) - -BASE_URL = "http://vis-www.cs.umass.edu/lfw/" -ARCHIVE_NAME = "lfw.tgz" -FUNNELED_ARCHIVE_NAME = "lfw-funneled.tgz" -TARGET_FILENAMES = [ - 'pairsDevTrain.txt', - 'pairsDevTest.txt', - 'pairs.txt', -] +# The original data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw.tgz +ARCHIVE = RemoteFileMetadata( + filename='lfw.tgz', + url='https://ndownloader.figshare.com/files/5976018', + checksum=('055f7d9c632d7370e6fb4afc7468d40f' + '970c34a80d4c6f50ffec63f5a8d536c0')) + +# The original funneled data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz +FUNNELED_ARCHIVE = RemoteFileMetadata( + 
filename='lfw-funneled.tgz', + url='https://ndownloader.figshare.com/files/5976015', + checksum=('b47c8422c8cded889dc5a13418c4bc2a' + 'bbda121092b3533a83306f90d900100a')) + +# The original target data can be found in: +# http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt', +# http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt', +# http://vis-www.cs.umass.edu/lfw/pairs.txt', +TARGETS = ( + RemoteFileMetadata( + filename='pairsDevTrain.txt', + url='https://ndownloader.figshare.com/files/5976012', + checksum=('1d454dada7dfeca0e7eab6f65dc4e97a' + '6312d44cf142207be28d688be92aabfa')), + + RemoteFileMetadata( + filename='pairsDevTest.txt', + url='https://ndownloader.figshare.com/files/5976009', + checksum=('7cb06600ea8b2814ac26e946201cdb30' + '4296262aad67d046a16a7ec85d0ff87c')), + + RemoteFileMetadata( + filename='pairs.txt', + url='https://ndownloader.figshare.com/files/5976006', + checksum=('ea42330c62c92989f9d7c03237ed5d59' + '1365e89b3e649747777b70e692dc1592')), +) def scale_face(face): @@ -71,42 +96,37 @@ def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True): data_home = get_data_home(data_home=data_home) lfw_home = join(data_home, "lfw_home") - if funneled: - archive_path = join(lfw_home, FUNNELED_ARCHIVE_NAME) - data_folder_path = join(lfw_home, "lfw_funneled") - archive_url = BASE_URL + FUNNELED_ARCHIVE_NAME - else: - archive_path = join(lfw_home, ARCHIVE_NAME) - data_folder_path = join(lfw_home, "lfw") - archive_url = BASE_URL + ARCHIVE_NAME - if not exists(lfw_home): makedirs(lfw_home) - for target_filename in TARGET_FILENAMES: - target_filepath = join(lfw_home, target_filename) + for target in TARGETS: + target_filepath = join(lfw_home, target.filename) if not exists(target_filepath): if download_if_missing: - url = BASE_URL + target_filename - logger.warning("Downloading LFW metadata: %s", url) - urllib.urlretrieve(url, target_filepath) + logger.info("Downloading LFW metadata: %s", target.url) + _fetch_remote(target, dirname=lfw_home) else: raise IOError("%s is missing" % target_filepath) - if not exists(data_folder_path): + if funneled: + data_folder_path = join(lfw_home, "lfw_funneled") + archive = FUNNELED_ARCHIVE + else: + data_folder_path = join(lfw_home, "lfw") + archive = ARCHIVE + if not exists(data_folder_path): + archive_path = join(lfw_home, archive.filename) if not exists(archive_path): if download_if_missing: - archive_path_temp = archive_path + ".tmp" - logger.warning("Downloading LFW data (~200MB): %s", - archive_url) - urllib.urlretrieve(archive_url, archive_path_temp) - rename(archive_path_temp, archive_path) + logger.info("Downloading LFW data (~200MB): %s", + archive.url) + _fetch_remote(archive, dirname=lfw_home) else: - raise IOError("%s is missing" % target_filepath) + raise IOError("%s is missing" % archive_path) import tarfile - logger.info("Decompressing the data archive to %s", data_folder_path) + logger.debug("Decompressing the data archive to %s", data_folder_path) tarfile.open(archive_path, "r:gz").extractall(path=lfw_home) remove(archive_path) @@ -156,7 +176,7 @@ def _load_imgs(file_paths, slice_, color, resize): # arrays for i, file_path in enumerate(file_paths): if i % 1000 == 0: - logger.info("Loading face #%05d / %05d", i + 1, n_faces) + logger.debug("Loading face #%05d / %05d", i + 1, n_faces) # Checks if jpeg reading worked. Refer to issue #3594 for more # details. 
@@ -301,7 +321,7 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) - logger.info('Loading LFW people faces from %s', lfw_home) + logger.debug('Loading LFW people faces from %s', lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage @@ -464,7 +484,7 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) - logger.info('Loading %s LFW pairs from %s', subset, lfw_home) + logger.debug('Loading %s LFW pairs from %s', subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py index ac80d49e937d2..b71264c109d10 100644 --- a/sklearn/datasets/olivetti_faces.py +++ b/sklearn/datasets/olivetti_faces.py @@ -22,29 +22,26 @@ # Copyright (c) 2011 David Warde-Farley # License: BSD 3 clause -from io import BytesIO from os.path import exists -from os import makedirs -try: - # Python 2 - import urllib2 - urlopen = urllib2.urlopen -except ImportError: - # Python 3 - import urllib.request - urlopen = urllib.request.urlopen +from os import makedirs, remove import numpy as np from scipy.io.matlab import loadmat from .base import get_data_home +from .base import _fetch_remote +from .base import RemoteFileMetadata from .base import _pkl_filepath from ..utils import check_random_state, Bunch from ..externals import joblib - -DATA_URL = "http://cs.nyu.edu/~roweis/data/olivettifaces.mat" -TARGET_FILENAME = "olivetti.pkz" +# The original data can be found at: +# http://cs.nyu.edu/~roweis/data/olivettifaces.mat +FACES = RemoteFileMetadata( + filename='olivettifaces.mat', + url='https://ndownloader.figshare.com/files/5976027', + checksum=('b612fb967f2dc77c9c62d3e1266e0c73' + 'd5fca46a4b8906c18e454d41af987794')) # Grab the module-level docstring to use as a description of the # dataset @@ -113,16 +110,18 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, data_home = get_data_home(data_home=data_home) if not exists(data_home): makedirs(data_home) - filepath = _pkl_filepath(data_home, TARGET_FILENAME) + filepath = _pkl_filepath(data_home, 'olivetti.pkz') if not exists(filepath): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") print('downloading Olivetti faces from %s to %s' - % (DATA_URL, data_home)) - fhandle = urlopen(DATA_URL) - buf = BytesIO(fhandle.read()) - mfile = loadmat(buf) + % (FACES.url, data_home)) + mat_path = _fetch_remote(FACES, dirname=data_home) + mfile = loadmat(file_name=mat_path) + # delete raw .mat data + remove(mat_path) + faces = mfile['faces'].T.copy() joblib.dump(faces, filepath, compress=6) del mfile diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py index b3ecbe1d94e24..7c3d6d3edde76 100644 --- a/sklearn/datasets/rcv1.py +++ b/sklearn/datasets/rcv1.py @@ -6,21 +6,17 @@ import logging +from os import remove from os.path import exists, join from gzip import GzipFile -from io import BytesIO -from contextlib import closing - -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen import numpy as np import scipy.sparse as sp from .base import get_data_home from .base import 
_pkl_filepath +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils.fixes import makedirs from ..externals import joblib from .svmlight_format import load_svmlight_files @@ -28,12 +24,49 @@ from ..utils import Bunch -URL = ('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/' - 'a13-vector-files/lyrl2004_vectors') -URL_topics = ('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/' - 'a08-topic-qrels/rcv1-v2.topics.qrels.gz') - -logger = logging.getLogger() +# The original data can be found at: +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt0.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt1.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt2.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt3.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_train.dat.gz +XY_METADATA = ( + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976069', + checksum=('ed40f7e418d10484091b059703eeb95a' + 'e3199fe042891dcec4be6696b9968374'), + filename='lyrl2004_vectors_test_pt0.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976066', + checksum=('87700668ae45d45d5ca1ef6ae9bd81ab' + '0f5ec88cc95dcef9ae7838f727a13aa6'), + filename='lyrl2004_vectors_test_pt1.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976063', + checksum=('48143ac703cbe33299f7ae9f4995db4' + '9a258690f60e5debbff8995c34841c7f5'), + filename='lyrl2004_vectors_test_pt2.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976060', + checksum=('dfcb0d658311481523c6e6ca0c3f5a3' + 'e1d3d12cde5d7a8ce629a9006ec7dbb39'), + filename='lyrl2004_vectors_test_pt3.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976057', + checksum=('5468f656d0ba7a83afc7ad44841cf9a5' + '3048a5c083eedc005dcdb5cc768924ae'), + filename='lyrl2004_vectors_train.dat.gz') +) + +# The original data can be found at: +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz +TOPICS_METADATA = RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976048', + checksum=('2a98e5e5d8b770bded93afc8930d882' + '99474317fe14181aee1466cc754d0d1c1'), + filename='rcv1v2.topics.qrels.gz') + +logger = logging.getLogger(__name__) def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, @@ -125,19 +158,18 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, # load data (X) and sample_id if download_if_missing and (not exists(samples_path) or not exists(sample_id_path)): - file_urls = ["%s_test_pt%d.dat.gz" % (URL, i) for i in range(4)] - file_urls.append("%s_train.dat.gz" % URL) files = [] - for file_url in file_urls: - logger.warning("Downloading %s" % file_url) - with closing(urlopen(file_url)) as online_file: - # buffer the full file in memory to make possible to Gzip to - # work correctly - f = BytesIO(online_file.read()) - files.append(GzipFile(fileobj=f)) + for each in XY_METADATA: + logger.info("Downloading %s" % each.url) + file_path = _fetch_remote(each, dirname=rcv1_dir) + files.append(GzipFile(filename=file_path)) Xy = load_svmlight_files(files, n_features=N_FEATURES) + # delete archives + for f in files: + remove(f.name) + # Training data is before testing data X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], 
Xy[6]]).tocsr() sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7])) @@ -145,7 +177,6 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(X, samples_path, compress=9) joblib.dump(sample_id, sample_id_path, compress=9) - else: X = joblib.load(samples_path) sample_id = joblib.load(sample_id_path) @@ -153,9 +184,9 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, # load target (y), categories, and sample_id_bis if download_if_missing and (not exists(sample_topics_path) or not exists(topics_path)): - logger.warning("Downloading %s" % URL_topics) - with closing(urlopen(URL_topics)) as online_topics: - f = BytesIO(online_topics.read()) + logger.info("Downloading %s" % TOPICS_METADATA.url) + topics_archive_path = _fetch_remote(TOPICS_METADATA, + dirname=rcv1_dir) # parse the target file n_cat = -1 @@ -164,7 +195,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8) sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32) category_names = {} - for line in GzipFile(fileobj=f, mode='rb'): + for line in GzipFile(filename=topics_archive_path, mode='rb'): line_components = line.decode("ascii").split(u" ") if len(line_components) == 3: cat, doc, _ = line_components @@ -179,6 +210,9 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, sample_id_bis[n_doc] = doc y[n_doc, category_names[cat]] = 1 + # delete archive + remove(topics_archive_path) + # Samples in X are ordered with sample_id, # whereas in y, they are ordered with sample_id_bis. permutation = _find_permutation(sample_id_bis, sample_id) @@ -196,7 +230,6 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(y, sample_topics_path, compress=9) joblib.dump(categories, topics_path, compress=9) - else: y = joblib.load(sample_topics_path) categories = joblib.load(topics_path) diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index 556ad9ea45e05..1770889849209 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -38,33 +38,45 @@ # License: BSD 3 clause from io import BytesIO -from os import makedirs +from os import makedirs, remove from os.path import exists -try: - # Python 2 - from urllib2 import urlopen - PY2 = True -except ImportError: - # Python 3 - from urllib.request import urlopen - PY2 = False +import sys +import logging import numpy as np -from sklearn.datasets.base import get_data_home +from .base import get_data_home +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils import Bunch from sklearn.datasets.base import _pkl_filepath from sklearn.externals import joblib -DIRECTORY_URL = "http://biodiversityinformatics.amnh.org/open_source/maxent/" +PY3_OR_LATER = sys.version_info[0] >= 3 -SAMPLES_URL = DIRECTORY_URL + "samples.zip" -COVERAGES_URL = DIRECTORY_URL + "coverages.zip" +# The original data can be found at: +# http://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip +SAMPLES = RemoteFileMetadata( + filename='samples.zip', + url='https://ndownloader.figshare.com/files/5976075', + checksum=('abb07ad284ac50d9e6d20f1c4211e0fd' + '3c098f7f85955e89d321ee8efe37ac28')) + +# The original data can be found at: +# http://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip +COVERAGES = RemoteFileMetadata( + filename='coverages.zip', + url='https://ndownloader.figshare.com/files/5976078', + 
checksum=('4d862674d72e79d6cee77e63b98651ec' + '7926043ba7d39dcb31329cf3f6073807')) DATA_ARCHIVE_NAME = "species_coverage.pkz" +logger = logging.getLogger(__name__) + + def _load_coverage(F, header_length=6, dtype=np.int16): """Load a coverage file from an open file object. @@ -94,12 +106,13 @@ def _load_csv(F): rec : np.ndarray record array representing the data """ - if PY2: - # Numpy recarray wants Python 2 str but not unicode - names = F.readline().strip().split(',') - else: + if PY3_OR_LATER: # Numpy recarray wants Python 3 str but not bytes... names = F.readline().decode('ascii').strip().split(',') + else: + # Numpy recarray wants Python 2 str but not unicode + names = F.readline().strip().split(',') + rec = np.loadtxt(F, skiprows=0, delimiter=',', dtype='a22,f4,f4') rec.dtype.names = names return rec @@ -227,10 +240,11 @@ def fetch_species_distributions(data_home=None, if not exists(archive_path): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") - - print('Downloading species data from %s to %s' % (SAMPLES_URL, - data_home)) - X = np.load(BytesIO(urlopen(SAMPLES_URL).read())) + logger.info('Downloading species data from %s to %s' % ( + SAMPLES.url, data_home)) + samples_path = _fetch_remote(SAMPLES, dirname=data_home) + X = np.load(samples_path) # samples.zip is a valid npz + remove(samples_path) for f in X.files: fhandle = BytesIO(X[f]) @@ -239,15 +253,16 @@ def fetch_species_distributions(data_home=None, if 'test' in f: test = _load_csv(fhandle) - print('Downloading coverage data from %s to %s' % (COVERAGES_URL, - data_home)) - - X = np.load(BytesIO(urlopen(COVERAGES_URL).read())) + logger.info('Downloading coverage data from %s to %s' % ( + COVERAGES.url, data_home)) + coverages_path = _fetch_remote(COVERAGES, dirname=data_home) + X = np.load(coverages_path) # coverages.zip is a valid npz + remove(coverages_path) coverages = [] for f in X.files: fhandle = BytesIO(X[f]) - print(' - converting', f) + logger.debug(' - converting {}'.format(f)) coverages.append(_load_coverage(fhandle)) coverages = np.asarray(coverages, dtype=dtype) diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index 47b543d8d2e16..73025966ab072 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -49,23 +49,23 @@ from .base import get_data_home from .base import load_files from .base import _pkl_filepath +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils import check_random_state, Bunch from ..feature_extraction.text import CountVectorizer from ..preprocessing import normalize -from ..externals import joblib, six - -if six.PY3: - from urllib.request import urlopen -else: - from urllib2 import urlopen - +from ..externals import joblib logger = logging.getLogger(__name__) +# The original data can be found at: +# http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz +ARCHIVE = RemoteFileMetadata( + filename='20news-bydate.tar.gz', + url='https://ndownloader.figshare.com/files/5975967', + checksum=('8f1b2514ca22a5ade8fbb9cfa5727df9' + '5fa587f4c87b786e15c759fa66d95610')) -URL = ("http://people.csail.mit.edu/jrennie/" - "20Newsgroups/20news-bydate.tar.gz") -ARCHIVE_NAME = "20news-bydate.tar.gz" CACHE_NAME = "20news-bydate.pkz" TRAIN_FOLDER = "20news-bydate-train" TEST_FOLDER = "20news-bydate-test" @@ -73,25 +73,16 @@ def download_20newsgroups(target_dir, cache_path): """Download the 20 newsgroups data and stored it as a zipped pickle.""" - 
archive_path = os.path.join(target_dir, ARCHIVE_NAME) train_path = os.path.join(target_dir, TRAIN_FOLDER) test_path = os.path.join(target_dir, TEST_FOLDER) if not os.path.exists(target_dir): os.makedirs(target_dir) - if os.path.exists(archive_path): - # Download is not complete as the .tar.gz file is removed after - # download. - logger.warning("Download was incomplete, downloading again.") - os.remove(archive_path) - - logger.warning("Downloading dataset from %s (14 MB)", URL) - opener = urlopen(URL) - with open(archive_path, 'wb') as f: - f.write(opener.read()) + logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url) + archive_path = _fetch_remote(ARCHIVE, dirname=target_dir) - logger.info("Decompressing %s", archive_path) + logger.debug("Decompressing %s", archive_path) tarfile.open(archive_path, "r:gz").extractall(path=target_dir) os.remove(archive_path)
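
The pattern this patch repeats in every fetcher — download with `_fetch_remote`, verify the SHA256 checksum against the `RemoteFileMetadata` entry, parse the archive, then `remove` it — can be sketched standalone as below. This is a minimal illustration, not part of the patch: the `EXAMPLE` descriptor (filename, URL, checksum) is a hypothetical placeholder, and it uses the Python 3 stdlib `urlretrieve` directly rather than the `sklearn.externals.six.moves` import the patch uses for Python 2 compatibility.

# Standalone sketch of the download-and-verify pattern introduced by this
# patch (hypothetical URL/checksum; the real fetchers point at figshare mirrors).
import hashlib
import os
from collections import namedtuple
from urllib.request import urlretrieve  # the patch imports this via six.moves

RemoteFileMetadata = namedtuple('RemoteFileMetadata',
                                ['filename', 'url', 'checksum'])


def sha256(path, chunk_size=8192):
    """Hash the file at `path` in fixed-size chunks to bound memory use."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()


def fetch_remote(remote, dirname=None):
    """Download `remote.url`, check its SHA256 checksum, return the local path."""
    file_path = (remote.filename if dirname is None
                 else os.path.join(dirname, remote.filename))
    urlretrieve(remote.url, file_path)
    checksum = sha256(file_path)
    if checksum != remote.checksum:
        raise IOError("{} has an SHA256 checksum ({}) differing from "
                      "expected ({}), file may be corrupted."
                      .format(file_path, checksum, remote.checksum))
    return file_path


if __name__ == '__main__':
    # Hypothetical dataset descriptor -- substitute a real URL and the SHA256
    # of the file it serves before running.
    EXAMPLE = RemoteFileMetadata(
        filename='example.csv.gz',
        url='https://example.com/example.csv.gz',
        checksum='0' * 64)
    path = fetch_remote(EXAMPLE, dirname='.')
    try:
        pass  # ...parse the archive here (np.loadtxt, tarfile, GzipFile, ...)
    finally:
        os.remove(path)  # like the fetchers, delete the raw archive once parsed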