
Commit d23e1d2

check md5 of datasets and add resume functionality to downloads

1 parent b5298da commit d23e1d2

File tree

9 files changed: +262 -110 lines

sklearn/datasets/base.py

Lines changed: 85 additions & 3 deletions
@@ -6,6 +6,7 @@
 # 2010 Fabian Pedregosa <[email protected]>
 # 2010 Olivier Grisel <[email protected]>
 # License: BSD 3 clause
+from __future__ import print_function

 import os
 import csv
@@ -18,10 +19,16 @@
 from os.path import expanduser
 from os.path import isdir
 from os.path import splitext
-from os import listdir
-from os import makedirs
+from os.path import getsize
+from os import listdir, makedirs, rename, remove
+
+try:
+    import urllib.request as urllib  # for backwards compatibility
+except ImportError:
+    import urllib

 import numpy as np
+import hashlib

 from ..utils import check_random_state

@@ -606,7 +613,7 @@ def load_boston(return_X_y=False):

     (data, target) : tuple if ``return_X_y`` is True

-        .. versionadded:: 0.18
+    .. versionadded:: 0.18

     Examples
     --------
@@ -762,3 +769,78 @@ def _pkl_filepath(*args, **kwargs):
     basename += py3_suffix
     new_args = args[:-1] + (basename + ext,)
     return join(*new_args)
+
+
+class partialURLOpener(urllib.FancyURLopener):
+    """
+    Override HTTP Error 206 (partial file being sent).
+    """
+    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
+        # ignore the expected "error" code for a partial download
+        pass
+
+
+def md5(path):
+    md5hash = hashlib.md5()
+    md5hash.update(open(path, 'rb').read())
+    return md5hash.hexdigest()
+
+
+def validate_file_md5(expected_checksum, path):
+    if expected_checksum != md5(path):
+        remove(path)
+        raise ValueError("{} has an MD5 hash differing "
+                         "from the expected one; the file may be "
+                         "corrupted.".format(path))
+
+
+def fetch_and_verify_dataset(URL, path, checksum):
+    """
+    Fetch a dataset from a URL and check its MD5 checksum to ensure
+    that the fetch completed and the correct file was downloaded.
+
+    Parameters
+    ----------
+    URL : string
+        URL to fetch the download from.
+
+    path : string
+        Path to save the file to.
+
+    checksum : string
+        MD5 checksum to verify the data against.
+    """
+    existing_size = 0
+    resume_url_downloader = partialURLOpener()
+    path_temp = path + ".tmp"
+    if exists(path_temp):
+        # path_temp exists, so resume the download
+        temp_file = open(path_temp, "ab")
+        # find out how many bytes of path_temp were already downloaded
+        existing_size = getsize(path_temp)
+        print("Resuming download from previous temp file, "
+              "already have {} bytes".format(existing_size))
+        # download only the remainder of the file
+        resume_url_downloader.addheader("Range",
+                                        "bytes={}-".format(existing_size))
+    else:
+        # no path_temp, so download from scratch
+        temp_file = open(path_temp, "wb")
+
+    dataset_url = resume_url_downloader.open(URL)
+    while True:
+        chunk = dataset_url.read(8192)
+        if not chunk:
+            break
+        temp_file.write(chunk)
+
+    dataset_url.close()
+    temp_file.close()
+    # verify the checksum of the downloaded temp file
+    print("verifying checksum")
+    if checksum != md5(path_temp):
+        remove(path_temp)
+        raise ValueError("Downloaded file had an MD5 hash differing "
+                         "from the expected one; the file may be corrupted.")
+    print("done verifying checksum")
+    # move the temporary file to the expected location
+    rename(path_temp, path)
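
A minimal usage sketch of the helpers added above, assuming a scikit-learn build that includes this commit; the URL, target path and checksum are hypothetical placeholders, not values from this commit. If an interrupted run left a partial "<path>.tmp" file behind, the call resumes it via an HTTP Range header before verifying and renaming the file into place.

# Hedged usage sketch; URL, path and checksum are hypothetical placeholders.
from sklearn.datasets.base import fetch_and_verify_dataset

DATA_URL = "https://example.com/some_dataset.tgz"       # hypothetical URL
archive_path = "/tmp/some_dataset.tgz"                  # hypothetical path
expected_checksum = "d41d8cd98f00b204e9800998ecf8427e"  # hypothetical MD5

# Downloads to /tmp/some_dataset.tgz.tmp (resuming any partial file),
# verifies the MD5, then renames the temp file to archive_path.
fetch_and_verify_dataset(DATA_URL, archive_path, expected_checksum)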

sklearn/datasets/california_housing.py

Lines changed: 12 additions & 12 deletions
@@ -21,21 +21,14 @@
 # Authors: Peter Prettenhofer
 # License: BSD 3 clause

-from io import BytesIO
-from os.path import exists
-from os import makedirs
+from os.path import exists, join
+from os import makedirs, remove
 import tarfile

-try:
-    # Python 2
-    from urllib2 import urlopen
-except ImportError:
-    # Python 3+
-    from urllib.request import urlopen
-
 import numpy as np

 from .base import get_data_home, Bunch
+from .base import fetch_and_verify_dataset, validate_file_md5
 from .base import _pkl_filepath
 from ..externals import joblib

@@ -90,18 +83,25 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
         print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
-        archive_fileobj = BytesIO(urlopen(DATA_URL).read())
+        archive_path = join(data_home, "cal_housing.tgz")
+        expected_checksum = "130d0eececf165046ec4dc621d121d80"
+        fetch_and_verify_dataset(DATA_URL, archive_path, expected_checksum)
         fileobj = tarfile.open(
             mode="r:gz",
-            fileobj=archive_fileobj).extractfile(
+            name=archive_path).extractfile(
                 'CaliforniaHousing/cal_housing.data')
+        remove(archive_path)

         cal_housing = np.loadtxt(fileobj, delimiter=',')
         # Columns are not in the same order compared to the previous
         # URL resource on lib.stat.cmu.edu
         columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
         cal_housing = cal_housing[:, columns_index]
         joblib.dump(cal_housing, filepath, compress=6)
+        # assert that dumped file has correct md5 hash
+        expected_checksum = "39c2dc70c4aad72e44b741c37163e6cc"
+        validate_file_md5(expected_checksum, filepath)
+
     else:
         cal_housing = joblib.load(filepath)
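
The hard-coded values such as "130d0eececf165046ec4dc621d121d80" above are just the MD5 hexdigest of the reference file. A sketch of how such a checksum can be recomputed locally; unlike the md5() helper in base.py, which reads the whole file at once, this variant streams the file in chunks so a large archive never has to fit in memory (the local path is a made-up example):

import hashlib

def md5_chunked(path, chunk_size=8192):
    # stream the file through the hash in 8 KB chunks
    md5hash = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5hash.update(chunk)
    return md5hash.hexdigest()

# hypothetical local copy of the archive
print(md5_chunked("/tmp/cal_housing.tgz"))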

sklearn/datasets/covtype.py

Lines changed: 17 additions & 12 deletions
@@ -15,18 +15,14 @@
 # License: BSD 3 clause

 from gzip import GzipFile
-from io import BytesIO
 import logging
 from os.path import exists, join
-try:
-    from urllib2 import urlopen
-except ImportError:
-    from urllib.request import urlopen
+from os import remove

 import numpy as np

-from .base import get_data_home
-from .base import Bunch
+from .base import get_data_home, Bunch
+from .base import fetch_and_verify_dataset, validate_file_md5
 from .base import _pkl_filepath
 from ..utils.fixes import makedirs
 from ..externals import joblib
@@ -35,8 +31,7 @@

 URL = 'https://ndownloader.figshare.com/files/5976039'

-
-logger = logging.getLogger()
+logger = logging.getLogger(__name__)


 def fetch_covtype(data_home=None, download_if_missing=True,
@@ -89,16 +84,26 @@ def fetch_covtype(data_home=None, download_if_missing=True,

     if download_if_missing and not available:
         makedirs(covtype_dir, exist_ok=True)
-        logger.warning("Downloading %s" % URL)
-        f = BytesIO(urlopen(URL).read())
-        Xy = np.genfromtxt(GzipFile(fileobj=f), delimiter=',')
+        logger.info("Downloading %s" % URL)
+
+        archive_path = join(covtype_dir, "covtype.data.gz")
+        expected_checksum = "99670d8d942f09d459c7d4486fca8af5"
+        fetch_and_verify_dataset(URL, archive_path, expected_checksum)
+        Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=',')
+        # delete archive
+        remove(archive_path)

         X = Xy[:, :-1]
         y = Xy[:, -1].astype(np.int32)

         joblib.dump(X, samples_path, compress=9)
         joblib.dump(y, targets_path, compress=9)
+        # check md5 of dumped samples and targets
+        expected_samples_checksum = "19b80d5fa6590346b357b4cb75562f0e"
+        validate_file_md5(expected_samples_checksum, samples_path)

+        expected_targets_checksum = "b79a24223e6a55bd486b7f796e8e5305"
+        validate_file_md5(expected_targets_checksum, targets_path)
     try:
         X, y
     except NameError:
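
Note the switch from the root logger to logging.getLogger(__name__) and from warning to info: the download message now goes to the module's own logger at INFO level, so callers opt in instead of being warned unconditionally. A sketch of how a caller could surface it, using only the standard logging API (the logger name simply follows the module path):

import logging

# opt in to the INFO-level download message from the module logger
logging.basicConfig()
logging.getLogger("sklearn.datasets.covtype").setLevel(logging.INFO)

from sklearn.datasets import fetch_covtype
covtype = fetch_covtype(download_if_missing=True)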

sklearn/datasets/kddcup99.py

Lines changed: 34 additions & 14 deletions
@@ -11,19 +11,14 @@
 import sys
 import errno
 from gzip import GzipFile
-from io import BytesIO
 import logging
 import os
 from os.path import exists, join
-try:
-    from urllib2 import urlopen
-except ImportError:
-    from urllib.request import urlopen

 import numpy as np

-from .base import get_data_home
-from .base import Bunch
+from .base import get_data_home, Bunch
+from .base import fetch_and_verify_dataset, validate_file_md5
 from ..externals import joblib, six
 from ..utils import check_random_state
 from ..utils import shuffle as shuffle_method
@@ -33,7 +28,7 @@

 URL = 'https://ndownloader.figshare.com/files/5976045'

-
+logging.basicConfig()
 logger = logging.getLogger()


@@ -264,18 +259,23 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
     dir_suffix = ""
     if percent10:
         kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix)
+        archive_path = join(kddcup_dir, "kddcup99_10_data")
+        expected_checksum = "c421989ff187d340c1265ac3080a3229"
     else:
         kddcup_dir = join(data_home, "kddcup99" + dir_suffix)
+        archive_path = join(kddcup_dir, "kddcup99_data")
+        expected_checksum = "3745289f84bdd907c03baca24f9f81bc"
+
     samples_path = join(kddcup_dir, "samples")
     targets_path = join(kddcup_dir, "targets")
     available = exists(samples_path)

     if download_if_missing and not available:
         _mkdirp(kddcup_dir)
         URL_ = URL10 if percent10 else URL
-        logger.warning("Downloading %s" % URL_)
-        f = BytesIO(urlopen(URL_).read())
-
+        logger.info("Downloading %s" % URL_)
+        fetch_and_verify_dataset(URL_, archive_path, expected_checksum)
+        print("before dt")
         dt = [('duration', int),
               ('protocol_type', 'S4'),
               ('service', 'S11'),
@@ -319,15 +319,20 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
               ('dst_host_srv_rerror_rate', float),
               ('labels', 'S16')]
         DT = np.dtype(dt)
-
-        file_ = GzipFile(fileobj=f, mode='r')
+        print("after dt")
+        print("extracting archive")
+        logger.info("extracting archive")
+        file_ = GzipFile(filename=archive_path, mode='r')
         Xy = []
         for line in file_.readlines():
             if six.PY3:
                 line = line.decode()
             Xy.append(line.replace('\n', '').split(','))
         file_.close()
-        print('extraction done')
+        print("extraction done")
+        logger.info('extraction done')
+        os.remove(archive_path)
+
         Xy = np.asarray(Xy, dtype=object)
         for j in range(42):
             Xy[:, j] = Xy[:, j].astype(DT[j])
@@ -338,8 +343,23 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
         # (error: 'Incorrect data length while decompressing[...] the file
         # could be corrupted.')

+        print("dumping joblib")
         joblib.dump(X, samples_path, compress=0)
         joblib.dump(y, targets_path, compress=0)
+        # check md5 of dumped samples and targets
+        if percent10:
+            expected_samples_checksum = "1b292b59b96894de38da4a984df2a483"
+            validate_file_md5(expected_samples_checksum, samples_path)
+
+            expected_targets_checksum = "956a3e4d5ea62aedeb226fd104798dc9"
+            validate_file_md5(expected_targets_checksum, targets_path)
+
+        else:
+            expected_samples_checksum = "7b6f71d4557254f26d73e52d2b39b46e"
+            validate_file_md5(expected_samples_checksum, samples_path)
+
+            expected_targets_checksum = "0422b093c0bc5bf60b586c8060698ef3"
+            validate_file_md5(expected_targets_checksum, targets_path)

     try:
         X, y
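
A sketch of validate_file_md5's failure path as defined in base.py above: when the hash does not match, the file is deleted and a ValueError is raised, so the next fetch starts from a clean slate. This assumes a scikit-learn build that includes this commit; the temporary file and its contents are made up for illustration.

import os
import tempfile

from sklearn.datasets.base import md5, validate_file_md5

path = os.path.join(tempfile.mkdtemp(), "sample.bin")  # throwaway test file
with open(path, "wb") as f:
    f.write(b"some dataset bytes")

# matching checksum: the file is left in place
validate_file_md5(md5(path), path)

# deliberately wrong checksum: the file is removed and ValueError raised
try:
    validate_file_md5("0" * 32, path)
except ValueError as exc:
    print(exc)
    assert not os.path.exists(path)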
