
Commit d23e1d2

check md5 of datasets and add resume functionality to downloads

1 parent b5298da commit d23e1d2

File tree

9 files changed: +262 -110 lines

sklearn/datasets/base.py

Lines changed: 85 additions & 3 deletions
@@ -6,6 +6,7 @@
 # 2010 Fabian Pedregosa <[email protected]>
 # 2010 Olivier Grisel <[email protected]>
 # License: BSD 3 clause
+from __future__ import print_function

 import os
 import csv
@@ -18,10 +19,16 @@
 from os.path import expanduser
 from os.path import isdir
 from os.path import splitext
-from os import listdir
-from os import makedirs
+from os.path import getsize
+from os import listdir, makedirs, rename, remove
+
+try:
+    import urllib.request as urllib  # for backwards compatibility
+except ImportError:
+    import urllib

 import numpy as np
+import hashlib

 from ..utils import check_random_state

@@ -606,7 +613,7 @@ def load_boston(return_X_y=False):

     (data, target) : tuple if ``return_X_y`` is True

-        .. versionadded:: 0.18
+    .. versionadded:: 0.18

     Examples
     --------
@@ -762,3 +769,78 @@ def _pkl_filepath(*args, **kwargs):
     basename += py3_suffix
     new_args = args[:-1] + (basename + ext,)
     return join(*new_args)
+
+
+class partialURLOpener(urllib.FancyURLopener):
+    """
+    Override HTTP Error 206 (partial file being sent).
+    """
+    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
+        # ignore the expected "error" code for a partial download
+        pass
+
+
+def md5(path):
+    md5hash = hashlib.md5()
+    md5hash.update(open(path, 'rb').read())
+    return md5hash.hexdigest()
+
+
+def validate_file_md5(expected_checksum, path):
+    if expected_checksum != md5(path):
+        remove(path)
+        raise ValueError("{} has an MD5 hash differing "
+                         "from the expected one; the file may be "
+                         "corrupted.".format(path))
+
+
+def fetch_and_verify_dataset(URL, path, checksum):
+    """
+    Fetch a dataset from a URL and check its MD5 checksum to ensure
+    that the fetch completed and the correct file was downloaded.
+
+    Parameters
+    ----------
+    URL : string
+        URL to fetch the download from.
+
+    path : string
+        Path to save the file to.
+
+    checksum : string
+        MD5 checksum to verify the data against.
+    """
+    existing_size = 0
+    resume_url_downloader = partialURLOpener()
+    path_temp = path + ".tmp"
+    if exists(path_temp):
+        # path_temp exists, so resume the download
+        temp_file = open(path_temp, "ab")
+        # find out how many bytes of path_temp were already downloaded
+        existing_size = getsize(path_temp)
+        print("Resuming download from previous temp file, "
+              "already have {} bytes".format(existing_size))
+        # download only the remainder of the file
+        resume_url_downloader.addheader("Range",
+                                        "bytes={}-".format(existing_size))
+    else:
+        # no path_temp, so download from scratch
+        temp_file = open(path_temp, "wb")
+
+    dataset_url = resume_url_downloader.open(URL)
+    while True:
+        chunk = dataset_url.read(8192)
+        if not chunk:
+            break
+        temp_file.write(chunk)
+
+    dataset_url.close()
+    temp_file.close()
+    # verify the checksum of the downloaded temp file
+    print("verifying checksum")
+    if checksum != md5(path_temp):
+        remove(path_temp)
+        raise ValueError("Downloaded file had an MD5 hash differing "
+                         "from the expected one; the file may be corrupted.")
+    print("done verifying checksum")
+    # move the temporary file to the expected location
+    rename(path_temp, path)
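
A minimal usage sketch of the helpers added above, assuming a scikit-learn build that includes this commit; the URL, target path and checksum are hypothetical placeholders, not values from this commit. If an interrupted run left a partial "<path>.tmp" file behind, the call resumes it via an HTTP Range header before verifying and renaming the file into place.

# Hedged usage sketch; URL, path and checksum are hypothetical placeholders.
from sklearn.datasets.base import fetch_and_verify_dataset

DATA_URL = "https://example.com/some_dataset.tgz"       # hypothetical URL
archive_path = "/tmp/some_dataset.tgz"                  # hypothetical path
expected_checksum = "d41d8cd98f00b204e9800998ecf8427e"  # hypothetical MD5

# Downloads to /tmp/some_dataset.tgz.tmp (resuming any partial file),
# verifies the MD5, then renames the temp file to archive_path.
fetch_and_verify_dataset(DATA_URL, archive_path, expected_checksum)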

sklearn/datasets/california_housing.py

Lines changed: 12 additions & 12 deletions
@@ -21,21 +21,14 @@
 # Authors: Peter Prettenhofer
 # License: BSD 3 clause

-from io import BytesIO
-from os.path import exists
-from os import makedirs
+from os.path import exists, join
+from os import makedirs, remove
 import tarfile

-try:
-    # Python 2
-    from urllib2 import urlopen
-except ImportError:
-    # Python 3+
-    from urllib.request import urlopen
-
 import numpy as np

 from .base import get_data_home, Bunch
+from .base import fetch_and_verify_dataset, validate_file_md5
 from .base import _pkl_filepath
 from ..externals import joblib

@@ -90,18 +83,25 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
         print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
-        archive_fileobj = BytesIO(urlopen(DATA_URL).read())
+        archive_path = join(data_home, "cal_housing.tgz")
+        expected_checksum = "130d0eececf165046ec4dc621d121d80"
+        fetch_and_verify_dataset(DATA_URL, archive_path, expected_checksum)
         fileobj = tarfile.open(
             mode="r:gz",
-            fileobj=archive_fileobj).extractfile(
+            name=archive_path).extractfile(
                 'CaliforniaHousing/cal_housing.data')
+        remove(archive_path)

         cal_housing = np.loadtxt(fileobj, delimiter=',')
         # Columns are not in the same order compared to the previous
         # URL resource on lib.stat.cmu.edu
         columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
         cal_housing = cal_housing[:, columns_index]
         joblib.dump(cal_housing, filepath, compress=6)
+        # assert that dumped file has correct md5 hash
+        expected_checksum = "39c2dc70c4aad72e44b741c37163e6cc"
+        validate_file_md5(expected_checksum, filepath)
+
     else:
         cal_housing = joblib.load(filepath)
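
The hard-coded values such as "130d0eececf165046ec4dc621d121d80" above are just the MD5 hexdigest of the reference file. A sketch of how such a checksum can be recomputed locally; unlike the md5() helper in base.py, which reads the whole file at once, this variant streams the file in chunks so a large archive never has to fit in memory (the local path is a made-up example):

import hashlib

def md5_chunked(path, chunk_size=8192):
    # stream the file through the hash in 8 KB chunks
    md5hash = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5hash.update(chunk)
    return md5hash.hexdigest()

# hypothetical local copy of the archive
print(md5_chunked("/tmp/cal_housing.tgz"))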

sklearn/datasets/covtype.py

Lines changed: 17 additions & 12 deletions
@@ -15,18 +15,14 @@
 # License: BSD 3 clause

 from gzip import GzipFile
-from io import BytesIO
 import logging
 from os.path import exists, join
-try:
-    from urllib2 import urlopen
-except ImportError:
-    from urllib.request import urlopen
+from os import remove

 import numpy as np

-from .base import get_data_home
-from .base import Bunch
+from .base import get_data_home, Bunch
+from .base import fetch_and_verify_dataset, validate_file_md5
 from .base import _pkl_filepath
 from ..utils.fixes import makedirs
 from ..externals import joblib
@@ -35,8 +31,7 @@

 URL = 'https://ndownloader.figshare.com/files/5976039'

-
-logger = logging.getLogger()
+logger = logging.getLogger(__name__)


 def fetch_covtype(data_home=None, download_if_missing=True,
@@ -89,16 +84,26 @@ def fetch_covtype(data_home=None, download_if_missing=True,

     if download_if_missing and not available:
         makedirs(covtype_dir, exist_ok=True)
-        logger.warning("Downloading %s" % URL)
-        f = BytesIO(urlopen(URL).read())
-        Xy = np.genfromtxt(GzipFile(fileobj=f), delimiter=',')
+        logger.info("Downloading %s" % URL)
+
+        archive_path = join(covtype_dir, "covtype.data.gz")
+        expected_checksum = "99670d8d942f09d459c7d4486fca8af5"
+        fetch_and_verify_dataset(URL, archive_path, expected_checksum)
+        Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=',')
+        # delete archive
+        remove(archive_path)

         X = Xy[:, :-1]
         y = Xy[:, -1].astype(np.int32)

         joblib.dump(X, samples_path, compress=9)
         joblib.dump(y, targets_path, compress=9)
+        # check md5 of dumped samples and targets
+        expected_samples_checksum = "19b80d5fa6590346b357b4cb75562f0e"
+        validate_file_md5(expected_samples_checksum, samples_path)

+        expected_targets_checksum = "b79a24223e6a55bd486b7f796e8e5305"
+        validate_file_md5(expected_targets_checksum, targets_path)
     try:
         X, y
     except NameError:
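
Note the switch from the root logger to logging.getLogger(__name__) and from warning to info: the download message now goes to the module's own logger at INFO level, so callers opt in instead of being warned unconditionally. A sketch of how a caller could surface it, using only the standard logging API (the logger name simply follows the module path):

import logging

# opt in to the INFO-level download message from the module logger
logging.basicConfig()
logging.getLogger("sklearn.datasets.covtype").setLevel(logging.INFO)

from sklearn.datasets import fetch_covtype
covtype = fetch_covtype(download_if_missing=True)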

sklearn/datasets/kddcup99.py

Lines changed: 34 additions & 14 deletions
@@ -11,19 +11,14 @@
 import sys
 import errno
 from gzip import GzipFile
-from io import BytesIO
 import logging
 import os
 from os.path import exists, join
-try:
-    from urllib2 import urlopen
-except ImportError:
-    from urllib.request import urlopen

 import numpy as np

-from .base import get_data_home
-from .base import Bunch
+from .base import get_data_home, Bunch
+from .base import fetch_and_verify_dataset, validate_file_md5
 from ..externals import joblib, six
 from ..utils import check_random_state
 from ..utils import shuffle as shuffle_method
@@ -33,7 +28,7 @@

 URL = 'https://ndownloader.figshare.com/files/5976045'

-
+logging.basicConfig()
 logger = logging.getLogger()


@@ -264,18 +259,23 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
     dir_suffix = ""
     if percent10:
         kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix)
+        archive_path = join(kddcup_dir, "kddcup99_10_data")
+        expected_checksum = "c421989ff187d340c1265ac3080a3229"
     else:
         kddcup_dir = join(data_home, "kddcup99" + dir_suffix)
+        archive_path = join(kddcup_dir, "kddcup99_data")
+        expected_checksum = "3745289f84bdd907c03baca24f9f81bc"
+
     samples_path = join(kddcup_dir, "samples")
     targets_path = join(kddcup_dir, "targets")
     available = exists(samples_path)

     if download_if_missing and not available:
         _mkdirp(kddcup_dir)
         URL_ = URL10 if percent10 else URL
-        logger.warning("Downloading %s" % URL_)
-        f = BytesIO(urlopen(URL_).read())
-
+        logger.info("Downloading %s" % URL_)
+        fetch_and_verify_dataset(URL_, archive_path, expected_checksum)
+        print("before dt")
         dt = [('duration', int),
               ('protocol_type', 'S4'),
               ('service', 'S11'),
@@ -319,15 +319,20 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
               ('dst_host_srv_rerror_rate', float),
               ('labels', 'S16')]
         DT = np.dtype(dt)
-
-        file_ = GzipFile(fileobj=f, mode='r')
+        print("after dt")
+        print("extracting archive")
+        logger.info("extracting archive")
+        file_ = GzipFile(filename=archive_path, mode='r')
         Xy = []
         for line in file_.readlines():
             if six.PY3:
                 line = line.decode()
             Xy.append(line.replace('\n', '').split(','))
         file_.close()
-        print('extraction done')
+        print("extraction done")
+        logger.info('extraction done')
+        os.remove(archive_path)
+
         Xy = np.asarray(Xy, dtype=object)
         for j in range(42):
             Xy[:, j] = Xy[:, j].astype(DT[j])
@@ -338,8 +343,23 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
         # (error: 'Incorrect data length while decompressing[...] the file
         # could be corrupted.')

+        print("dumping joblib")
         joblib.dump(X, samples_path, compress=0)
         joblib.dump(y, targets_path, compress=0)
+        # check md5 of dumped samples and targets
+        if percent10:
+            expected_samples_checksum = "1b292b59b96894de38da4a984df2a483"
+            validate_file_md5(expected_samples_checksum, samples_path)
+
+            expected_targets_checksum = "956a3e4d5ea62aedeb226fd104798dc9"
+            validate_file_md5(expected_targets_checksum, targets_path)
+
+        else:
+            expected_samples_checksum = "7b6f71d4557254f26d73e52d2b39b46e"
+            validate_file_md5(expected_samples_checksum, samples_path)
+
+            expected_targets_checksum = "0422b093c0bc5bf60b586c8060698ef3"
+            validate_file_md5(expected_targets_checksum, targets_path)

     try:
         X, y
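
A sketch of validate_file_md5's failure path as defined in base.py above: when the hash does not match, the file is deleted and a ValueError is raised, so the next fetch starts from a clean slate. This assumes a scikit-learn build that includes this commit; the temporary file and its contents are made up for illustration.

import os
import tempfile

from sklearn.datasets.base import md5, validate_file_md5

path = os.path.join(tempfile.mkdtemp(), "sample.bin")  # throwaway test file
with open(path, "wb") as f:
    f.write(b"some dataset bytes")

# matching checksum: the file is left in place
validate_file_md5(md5(path), path)

# deliberately wrong checksum: the file is removed and ValueError raised
try:
    validate_file_md5("0" * 32, path)
except ValueError as exc:
    print(exc)
    assert not os.path.exists(path)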
