diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 3ca2a6814e70b..e74466efd8a95 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -17,6 +17,11 @@ import warnings import os from contextlib import contextmanager as _contextmanager +import logging + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.INFO) _ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 1441daf838032..3d5ceb0a7abff 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -6,39 +6,40 @@ # 2010 Fabian Pedregosa # 2010 Olivier Grisel # License: BSD 3 clause +from __future__ import print_function import os import csv import sys import shutil -from os import environ -from os.path import dirname -from os.path import join -from os.path import exists -from os.path import expanduser -from os.path import isdir -from os.path import splitext -from os import listdir -from os import makedirs +from collections import namedtuple +from os import environ, listdir, makedirs +from os.path import dirname, exists, expanduser, isdir, join, splitext +import hashlib + from ..utils import Bunch +from ..utils import check_random_state import numpy as np -from ..utils import check_random_state +from sklearn.externals.six.moves.urllib.request import urlretrieve + +RemoteFileMetadata = namedtuple('RemoteFileMetadata', + ['filename', 'url', 'checksum']) def get_data_home(data_home=None): """Return the path of the scikit-learn data dir. - This folder is used by some large dataset loaders to avoid - downloading the data several times. + This folder is used by some large dataset loaders to avoid downloading the + data several times. - By default the data dir is set to a folder named 'scikit_learn_data' - in the user home folder. + By default the data dir is set to a folder named 'scikit_learn_data' in the + user home folder. Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment - variable or programmatically by giving an explicit folder path. The - '~' symbol is expanded to the user home folder. + variable or programmatically by giving an explicit folder path. The '~' + symbol is expanded to the user home folder. If the folder does not already exist, it is automatically created. """ @@ -76,23 +77,22 @@ def load_files(container_path, description=None, categories=None, file_44.txt ... - The folder names are used as supervised signal label names. The - individual file names are not important. + The folder names are used as supervised signal label names. The individual + file names are not important. - This function does not try to extract features into a numpy array or - scipy sparse matrix. In addition, if load_content is false it - does not try to load the files in memory. + This function does not try to extract features into a numpy array or scipy + sparse matrix. In addition, if load_content is false it does not try to + load the files in memory. - To use text files in a scikit-learn classification or clustering - algorithm, you will need to use the `sklearn.feature_extraction.text` - module to build a feature extraction transformer that suits your - problem. + To use text files in a scikit-learn classification or clustering algorithm, + you will need to use the `sklearn.feature_extraction.text` module to build + a feature extraction transformer that suits your problem. 
- If you set load_content=True, you should also specify the encoding of - the text using the 'encoding' parameter. For many modern text files, - 'utf-8' will be the correct encoding. If you leave encoding equal to None, - then the content will be made of bytes instead of Unicode, and you will - not be able to use most functions in `sklearn.feature_extraction.text`. + If you set load_content=True, you should also specify the encoding of the + text using the 'encoding' parameter. For many modern text files, 'utf-8' + will be the correct encoding. If you leave encoding equal to None, then the + content will be made of bytes instead of Unicode, and you will not be able + to use most functions in `sklearn.feature_extraction.text`. Similar feature extractors should be built for other kind of unstructured data input such as images, audio, video, ... @@ -109,20 +109,19 @@ def load_files(container_path, description=None, categories=None, reference, etc. categories : A collection of strings or None, optional (default=None) - If None (default), load all the categories. - If not None, list of category names to load (other categories ignored). + If None (default), load all the categories. If not None, list of + category names to load (other categories ignored). load_content : boolean, optional (default=True) - Whether to load or not the content of the different files. If - true a 'data' attribute containing the text information is present - in the data structure returned. If not, a filenames attribute - gives the path to the files. + Whether to load or not the content of the different files. If true a + 'data' attribute containing the text information is present in the data + structure returned. If not, a filenames attribute gives the path to the + files. encoding : string or None (default is None) - If None, do not try to decode the content of the files (e.g. for - images or other non-text content). - If not None, encoding to use to decode text files to Unicode if - load_content is True. + If None, do not try to decode the content of the files (e.g. for images + or other non-text content). If not None, encoding to use to decode text + files to Unicode if load_content is True. decode_error : {'strict', 'ignore', 'replace'}, optional Instruction on what to do if a byte sequence is given to analyze that @@ -262,16 +261,15 @@ def load_wine(return_X_y=False): Returns ------- data : Bunch - Dictionary-like object, the interesting attributes are: - 'data', the data to learn, 'target', the classification labels, - 'target_names', the meaning of the labels, 'feature_names', the - meaning of the features, and 'DESCR', the - full description of the dataset. + Dictionary-like object, the interesting attributes are: 'data', the + data to learn, 'target', the classification labels, 'target_names', the + meaning of the labels, 'feature_names', the meaning of the features, + and 'DESCR', the full description of the dataset. (data, target) : tuple if ``return_X_y`` is True - The copy of UCI ML Wine Data Set dataset is - downloaded and modified to fit standard format from: + The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit + standard format from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data Examples @@ -332,8 +330,8 @@ def load_iris(return_X_y=False): Parameters ---------- return_X_y : boolean, default=False. - If True, returns ``(data, target)`` instead of a Bunch object. - See below for more information about the `data` and `target` object. 
+ If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` object. .. versionadded:: 0.18 @@ -709,15 +707,15 @@ def load_boston(return_X_y=False): def load_sample_images(): """Load sample images for image manipulation. + Loads both, ``china`` and ``flower``. Returns ------- data : Bunch - Dictionary-like object with the following attributes : - 'images', the two sample images, 'filenames', the file - names for the images, and 'DESCR' - the full description of the dataset. + Dictionary-like object with the following attributes : 'images', the + two sample images, 'filenames', the file names for the images, and + 'DESCR' the full description of the dataset. Examples -------- @@ -799,18 +797,18 @@ def load_sample_image(image_name): def _pkl_filepath(*args, **kwargs): """Ensure different filenames for Python 2 and Python 3 pickles - An object pickled under Python 3 cannot be loaded under Python 2. - An object pickled under Python 2 can sometimes not be loaded - correctly under Python 3 because some Python 2 strings are decoded as - Python 3 strings which can be problematic for objects that use Python 2 - strings as byte buffers for numerical data instead of "real" strings. + An object pickled under Python 3 cannot be loaded under Python 2. An object + pickled under Python 2 can sometimes not be loaded correctly under Python 3 + because some Python 2 strings are decoded as Python 3 strings which can be + problematic for objects that use Python 2 strings as byte buffers for + numerical data instead of "real" strings. Therefore, dataset loaders in scikit-learn use different files for pickles - manages by Python 2 and Python 3 in the same SCIKIT_LEARN_DATA folder so - as to avoid conflicts. + manages by Python 2 and Python 3 in the same SCIKIT_LEARN_DATA folder so as + to avoid conflicts. - args[-1] is expected to be the ".pkl" filename. Under Python 3, a - suffix is inserted before the extension to s + args[-1] is expected to be the ".pkl" filename. Under Python 3, a suffix is + inserted before the extension to s _pkl_filepath('/path/to/folder', 'filename.pkl') returns: - /path/to/folder/filename.pkl under Python 2 @@ -823,3 +821,50 @@ def _pkl_filepath(*args, **kwargs): basename += py3_suffix new_args = args[:-1] + (basename + ext,) return join(*new_args) + + +def _sha256(path): + """Calculate the sha256 hash of the file at path.""" + sha256hash = hashlib.sha256() + chunk_size = 8192 + with open(path, "rb") as f: + while True: + buffer = f.read(chunk_size) + if not buffer: + break + sha256hash.update(buffer) + return sha256hash.hexdigest() + + +def _fetch_remote(remote, dirname=None): + """Helper function to download a remote dataset into path + + Fetch a dataset pointed by remote's url, save into path using remote's + filename and ensure its integrity based on the SHA256 Checksum of the + downloaded file. + + Parameters + ----------- + remote : RemoteFileMetadata + Named tuple containing remote dataset meta information: url, filename + and checksum + + dirname : string + Directory to save the file to. + + Returns + ------- + file_path: string + Full path of the created file. 
+ """ + + file_path = (remote.filename if dirname is None + else join(dirname, remote.filename)) + urlretrieve(remote.url, file_path) + checksum = _sha256(file_path) + if remote.checksum != checksum: + raise IOError("{} has an SHA256 checksum ({}) " + "differing from expected ({}), " + "file may be corrupted.".format(file_path, checksum, + remote.checksum)) + return file_path diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index a9f21510b0f01..cc5882ecb9cb9 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -21,33 +21,33 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from io import BytesIO from os.path import exists -from os import makedirs +from os import makedirs, remove import tarfile -try: - # Python 2 - from urllib2 import urlopen -except ImportError: - # Python 3+ - from urllib.request import urlopen - import numpy as np +import logging from .base import get_data_home -from ..utils import Bunch +from .base import _fetch_remote from .base import _pkl_filepath +from .base import RemoteFileMetadata +from ..utils import Bunch from ..externals import joblib - -DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz" -TARGET_FILENAME = "cal_housing.pkz" +# The original data can be found at: +# http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz +ARCHIVE = RemoteFileMetadata( + filename='cal_housing.tgz', + url='https://ndownloader.figshare.com/files/5976036', + checksum=('aaa5c9a6afe2225cc2aed2723682ae40' + '3280c4a3695a2ddda4ffb5d8215ea681')) # Grab the module-level docstring to use as a description of the # dataset MODULE_DOCS = __doc__ +logger = logging.getLogger(__name__) def fetch_california_housing(data_home=None, download_if_missing=True): """Loader for the California housing dataset from StatLib. @@ -89,17 +89,20 @@ def fetch_california_housing(data_home=None, download_if_missing=True): if not exists(data_home): makedirs(data_home) - filepath = _pkl_filepath(data_home, TARGET_FILENAME) + filepath = _pkl_filepath(data_home, 'cal_housing.pkz') if not exists(filepath): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") - print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home)) - archive_fileobj = BytesIO(urlopen(DATA_URL).read()) + logger.info('Downloading Cal. 
housing from {} to {}'.format( + ARCHIVE.url, data_home)) + archive_path = _fetch_remote(ARCHIVE, dirname=data_home) + fileobj = tarfile.open( mode="r:gz", - fileobj=archive_fileobj).extractfile( + name=archive_path).extractfile( 'CaliforniaHousing/cal_housing.data') + remove(archive_path) cal_housing = np.loadtxt(fileobj, delimiter=',') # Columns are not in the same order compared to the previous diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py index a529e8579a7c0..c0c8f789975b1 100644 --- a/sklearn/datasets/covtype.py +++ b/sklearn/datasets/covtype.py @@ -15,29 +15,30 @@ # License: BSD 3 clause from gzip import GzipFile -from io import BytesIO import logging from os.path import exists, join -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen +from os import remove import numpy as np from .base import get_data_home +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils import Bunch from .base import _pkl_filepath from ..utils.fixes import makedirs from ..externals import joblib from ..utils import check_random_state +# The original data can be found in: +# http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz +ARCHIVE = RemoteFileMetadata( + filename='covtype.data.gz', + url='https://ndownloader.figshare.com/files/5976039', + checksum=('614360d0257557dd1792834a85a1cdeb' + 'fadc3c4f30b011d56afee7ffb5b15771')) -URL = ('http://archive.ics.uci.edu/ml/' - 'machine-learning-databases/covtype/covtype.data.gz') - - -logger = logging.getLogger() +logger = logging.getLogger(__name__) def fetch_covtype(data_home=None, download_if_missing=True, @@ -91,19 +92,21 @@ def fetch_covtype(data_home=None, download_if_missing=True, if download_if_missing and not available: if not exists(covtype_dir): makedirs(covtype_dir) - logger.warning("Downloading %s" % URL) - f = BytesIO(urlopen(URL).read()) - Xy = np.genfromtxt(GzipFile(fileobj=f), delimiter=',') + logger.info("Downloading %s" % ARCHIVE.url) + + archive_path = _fetch_remote(ARCHIVE, dirname=covtype_dir) + Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=',') + # delete archive + remove(archive_path) X = Xy[:, :-1] y = Xy[:, -1].astype(np.int32) joblib.dump(X, samples_path, compress=9) joblib.dump(y, targets_path, compress=9) - elif not available: - if not download_if_missing: - raise IOError("Data not found and `download_if_missing` is False") + elif not available and not download_if_missing: + raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py index 6d52c5b6214b2..66cb58f3d9aea 100644 --- a/sklearn/datasets/kddcup99.py +++ b/sklearn/datasets/kddcup99.py @@ -11,32 +11,38 @@ import sys import errno from gzip import GzipFile -from io import BytesIO import logging import os from os.path import exists, join -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen import numpy as np + +from .base import _fetch_remote from .base import get_data_home +from .base import RemoteFileMetadata from ..utils import Bunch from ..externals import joblib, six from ..utils import check_random_state from ..utils import shuffle as shuffle_method +# The original data can be found at: +# http://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz +ARCHIVE = RemoteFileMetadata( + filename='kddcup99_data', + url='https://ndownloader.figshare.com/files/5976045', + 
checksum=('3b6c942aa0356c0ca35b7b595a26c89d' + '343652c9db428893e7494f837b274292')) -URL10 = ('http://archive.ics.uci.edu/ml/' - 'machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz') - -URL = ('http://archive.ics.uci.edu/ml/' - 'machine-learning-databases/kddcup99-mld/kddcup.data.gz') +# The original data can be found at: +# http://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz +ARCHIVE_10_PERCENT = RemoteFileMetadata( + filename='kddcup99_10_data', + url='https://ndownloader.figshare.com/files/5976042', + checksum=('8045aca0d84e70e622d1148d7df78249' + '6f6333bf6eb979a1b0837c42a9fd9561')) - -logger = logging.getLogger() +logger = logging.getLogger(__name__) def fetch_kddcup99(subset=None, data_home=None, shuffle=False, @@ -273,20 +279,22 @@ def _fetch_brute_kddcup99(data_home=None, else: # Backward compat for Python 2 users dir_suffix = "" + if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) + archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) + archive = ARCHIVE + samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) - URL_ = URL10 if percent10 else URL - logger.warning("Downloading %s" % URL_) - f = BytesIO(urlopen(URL_).read()) - + logger.info("Downloading %s" % archive.url) + _fetch_remote(archive, dirname=kddcup_dir) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), @@ -330,15 +338,18 @@ def _fetch_brute_kddcup99(data_home=None, ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) - - file_ = GzipFile(fileobj=f, mode='r') + logger.debug("extracting archive") + archive_path = join(kddcup_dir, archive.filename) + file_ = GzipFile(filename=archive_path, mode='r') Xy = [] for line in file_.readlines(): if six.PY3: line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() - print('extraction done') + logger.debug('extraction done') + os.remove(archive_path) + Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) diff --git a/sklearn/datasets/lfw.py b/sklearn/datasets/lfw.py index 50834f7705ef6..0d5f56f189b45 100644 --- a/sklearn/datasets/lfw.py +++ b/sklearn/datasets/lfw.py @@ -23,18 +23,13 @@ # Copyright (c) 2011 Olivier Grisel # License: BSD 3 clause -from os import listdir, makedirs, remove, rename +from os import listdir, makedirs, remove from os.path import join, exists, isdir import logging import numpy as np -try: - import urllib.request as urllib # for backwards compatibility -except ImportError: - import urllib - -from .base import get_data_home +from .base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch from ..externals.joblib import Memory @@ -42,15 +37,45 @@ logger = logging.getLogger(__name__) - -BASE_URL = "http://vis-www.cs.umass.edu/lfw/" -ARCHIVE_NAME = "lfw.tgz" -FUNNELED_ARCHIVE_NAME = "lfw-funneled.tgz" -TARGET_FILENAMES = [ - 'pairsDevTrain.txt', - 'pairsDevTest.txt', - 'pairs.txt', -] +# The original data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw.tgz +ARCHIVE = RemoteFileMetadata( + filename='lfw.tgz', + url='https://ndownloader.figshare.com/files/5976018', + checksum=('055f7d9c632d7370e6fb4afc7468d40f' + '970c34a80d4c6f50ffec63f5a8d536c0')) + +# The original funneled data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz +FUNNELED_ARCHIVE = RemoteFileMetadata( + 
filename='lfw-funneled.tgz', + url='https://ndownloader.figshare.com/files/5976015', + checksum=('b47c8422c8cded889dc5a13418c4bc2a' + 'bbda121092b3533a83306f90d900100a')) + +# The original target data can be found in: +# http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt', +# http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt', +# http://vis-www.cs.umass.edu/lfw/pairs.txt', +TARGETS = ( + RemoteFileMetadata( + filename='pairsDevTrain.txt', + url='https://ndownloader.figshare.com/files/5976012', + checksum=('1d454dada7dfeca0e7eab6f65dc4e97a' + '6312d44cf142207be28d688be92aabfa')), + + RemoteFileMetadata( + filename='pairsDevTest.txt', + url='https://ndownloader.figshare.com/files/5976009', + checksum=('7cb06600ea8b2814ac26e946201cdb30' + '4296262aad67d046a16a7ec85d0ff87c')), + + RemoteFileMetadata( + filename='pairs.txt', + url='https://ndownloader.figshare.com/files/5976006', + checksum=('ea42330c62c92989f9d7c03237ed5d59' + '1365e89b3e649747777b70e692dc1592')), +) def scale_face(face): @@ -71,42 +96,37 @@ def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True): data_home = get_data_home(data_home=data_home) lfw_home = join(data_home, "lfw_home") - if funneled: - archive_path = join(lfw_home, FUNNELED_ARCHIVE_NAME) - data_folder_path = join(lfw_home, "lfw_funneled") - archive_url = BASE_URL + FUNNELED_ARCHIVE_NAME - else: - archive_path = join(lfw_home, ARCHIVE_NAME) - data_folder_path = join(lfw_home, "lfw") - archive_url = BASE_URL + ARCHIVE_NAME - if not exists(lfw_home): makedirs(lfw_home) - for target_filename in TARGET_FILENAMES: - target_filepath = join(lfw_home, target_filename) + for target in TARGETS: + target_filepath = join(lfw_home, target.filename) if not exists(target_filepath): if download_if_missing: - url = BASE_URL + target_filename - logger.warning("Downloading LFW metadata: %s", url) - urllib.urlretrieve(url, target_filepath) + logger.info("Downloading LFW metadata: %s", target.url) + _fetch_remote(target, dirname=lfw_home) else: raise IOError("%s is missing" % target_filepath) - if not exists(data_folder_path): + if funneled: + data_folder_path = join(lfw_home, "lfw_funneled") + archive = FUNNELED_ARCHIVE + else: + data_folder_path = join(lfw_home, "lfw") + archive = ARCHIVE + if not exists(data_folder_path): + archive_path = join(lfw_home, archive.filename) if not exists(archive_path): if download_if_missing: - archive_path_temp = archive_path + ".tmp" - logger.warning("Downloading LFW data (~200MB): %s", - archive_url) - urllib.urlretrieve(archive_url, archive_path_temp) - rename(archive_path_temp, archive_path) + logger.info("Downloading LFW data (~200MB): %s", + archive.url) + _fetch_remote(archive, dirname=lfw_home) else: - raise IOError("%s is missing" % target_filepath) + raise IOError("%s is missing" % archive_path) import tarfile - logger.info("Decompressing the data archive to %s", data_folder_path) + logger.debug("Decompressing the data archive to %s", data_folder_path) tarfile.open(archive_path, "r:gz").extractall(path=lfw_home) remove(archive_path) @@ -156,7 +176,7 @@ def _load_imgs(file_paths, slice_, color, resize): # arrays for i, file_path in enumerate(file_paths): if i % 1000 == 0: - logger.info("Loading face #%05d / %05d", i + 1, n_faces) + logger.debug("Loading face #%05d / %05d", i + 1, n_faces) # Checks if jpeg reading worked. Refer to issue #3594 for more # details. 
@@ -301,7 +321,7 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) - logger.info('Loading LFW people faces from %s', lfw_home) + logger.debug('Loading LFW people faces from %s', lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage @@ -464,7 +484,7 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) - logger.info('Loading %s LFW pairs from %s', subset, lfw_home) + logger.debug('Loading %s LFW pairs from %s', subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py index ac80d49e937d2..b71264c109d10 100644 --- a/sklearn/datasets/olivetti_faces.py +++ b/sklearn/datasets/olivetti_faces.py @@ -22,29 +22,26 @@ # Copyright (c) 2011 David Warde-Farley # License: BSD 3 clause -from io import BytesIO from os.path import exists -from os import makedirs -try: - # Python 2 - import urllib2 - urlopen = urllib2.urlopen -except ImportError: - # Python 3 - import urllib.request - urlopen = urllib.request.urlopen +from os import makedirs, remove import numpy as np from scipy.io.matlab import loadmat from .base import get_data_home +from .base import _fetch_remote +from .base import RemoteFileMetadata from .base import _pkl_filepath from ..utils import check_random_state, Bunch from ..externals import joblib - -DATA_URL = "http://cs.nyu.edu/~roweis/data/olivettifaces.mat" -TARGET_FILENAME = "olivetti.pkz" +# The original data can be found at: +# http://cs.nyu.edu/~roweis/data/olivettifaces.mat +FACES = RemoteFileMetadata( + filename='olivettifaces.mat', + url='https://ndownloader.figshare.com/files/5976027', + checksum=('b612fb967f2dc77c9c62d3e1266e0c73' + 'd5fca46a4b8906c18e454d41af987794')) # Grab the module-level docstring to use as a description of the # dataset @@ -113,16 +110,18 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, data_home = get_data_home(data_home=data_home) if not exists(data_home): makedirs(data_home) - filepath = _pkl_filepath(data_home, TARGET_FILENAME) + filepath = _pkl_filepath(data_home, 'olivetti.pkz') if not exists(filepath): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") print('downloading Olivetti faces from %s to %s' - % (DATA_URL, data_home)) - fhandle = urlopen(DATA_URL) - buf = BytesIO(fhandle.read()) - mfile = loadmat(buf) + % (FACES.url, data_home)) + mat_path = _fetch_remote(FACES, dirname=data_home) + mfile = loadmat(file_name=mat_path) + # delete raw .mat data + remove(mat_path) + faces = mfile['faces'].T.copy() joblib.dump(faces, filepath, compress=6) del mfile diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py index b3ecbe1d94e24..7c3d6d3edde76 100644 --- a/sklearn/datasets/rcv1.py +++ b/sklearn/datasets/rcv1.py @@ -6,21 +6,17 @@ import logging +from os import remove from os.path import exists, join from gzip import GzipFile -from io import BytesIO -from contextlib import closing - -try: - from urllib2 import urlopen -except ImportError: - from urllib.request import urlopen import numpy as np import scipy.sparse as sp from .base import get_data_home from .base import 
_pkl_filepath +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils.fixes import makedirs from ..externals import joblib from .svmlight_format import load_svmlight_files @@ -28,12 +24,49 @@ from ..utils import Bunch -URL = ('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/' - 'a13-vector-files/lyrl2004_vectors') -URL_topics = ('http://jmlr.csail.mit.edu/papers/volume5/lewis04a/' - 'a08-topic-qrels/rcv1-v2.topics.qrels.gz') - -logger = logging.getLogger() +# The original data can be found at: +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt0.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt1.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt2.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt3.dat.gz +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_train.dat.gz +XY_METADATA = ( + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976069', + checksum=('ed40f7e418d10484091b059703eeb95a' + 'e3199fe042891dcec4be6696b9968374'), + filename='lyrl2004_vectors_test_pt0.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976066', + checksum=('87700668ae45d45d5ca1ef6ae9bd81ab' + '0f5ec88cc95dcef9ae7838f727a13aa6'), + filename='lyrl2004_vectors_test_pt1.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976063', + checksum=('48143ac703cbe33299f7ae9f4995db4' + '9a258690f60e5debbff8995c34841c7f5'), + filename='lyrl2004_vectors_test_pt2.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976060', + checksum=('dfcb0d658311481523c6e6ca0c3f5a3' + 'e1d3d12cde5d7a8ce629a9006ec7dbb39'), + filename='lyrl2004_vectors_test_pt3.dat.gz'), + RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976057', + checksum=('5468f656d0ba7a83afc7ad44841cf9a5' + '3048a5c083eedc005dcdb5cc768924ae'), + filename='lyrl2004_vectors_train.dat.gz') +) + +# The original data can be found at: +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz +TOPICS_METADATA = RemoteFileMetadata( + url='https://ndownloader.figshare.com/files/5976048', + checksum=('2a98e5e5d8b770bded93afc8930d882' + '99474317fe14181aee1466cc754d0d1c1'), + filename='rcv1v2.topics.qrels.gz') + +logger = logging.getLogger(__name__) def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, @@ -125,19 +158,18 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, # load data (X) and sample_id if download_if_missing and (not exists(samples_path) or not exists(sample_id_path)): - file_urls = ["%s_test_pt%d.dat.gz" % (URL, i) for i in range(4)] - file_urls.append("%s_train.dat.gz" % URL) files = [] - for file_url in file_urls: - logger.warning("Downloading %s" % file_url) - with closing(urlopen(file_url)) as online_file: - # buffer the full file in memory to make possible to Gzip to - # work correctly - f = BytesIO(online_file.read()) - files.append(GzipFile(fileobj=f)) + for each in XY_METADATA: + logger.info("Downloading %s" % each.url) + file_path = _fetch_remote(each, dirname=rcv1_dir) + files.append(GzipFile(filename=file_path)) Xy = load_svmlight_files(files, n_features=N_FEATURES) + # delete archives + for f in files: + remove(f.name) + # Training data is before testing data X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], 
Xy[6]]).tocsr() sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7])) @@ -145,7 +177,6 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(X, samples_path, compress=9) joblib.dump(sample_id, sample_id_path, compress=9) - else: X = joblib.load(samples_path) sample_id = joblib.load(sample_id_path) @@ -153,9 +184,9 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, # load target (y), categories, and sample_id_bis if download_if_missing and (not exists(sample_topics_path) or not exists(topics_path)): - logger.warning("Downloading %s" % URL_topics) - with closing(urlopen(URL_topics)) as online_topics: - f = BytesIO(online_topics.read()) + logger.info("Downloading %s" % TOPICS_METADATA.url) + topics_archive_path = _fetch_remote(TOPICS_METADATA, + dirname=rcv1_dir) # parse the target file n_cat = -1 @@ -164,7 +195,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8) sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32) category_names = {} - for line in GzipFile(fileobj=f, mode='rb'): + for line in GzipFile(filename=topics_archive_path, mode='rb'): line_components = line.decode("ascii").split(u" ") if len(line_components) == 3: cat, doc, _ = line_components @@ -179,6 +210,9 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, sample_id_bis[n_doc] = doc y[n_doc, category_names[cat]] = 1 + # delete archive + remove(topics_archive_path) + # Samples in X are ordered with sample_id, # whereas in y, they are ordered with sample_id_bis. permutation = _find_permutation(sample_id_bis, sample_id) @@ -196,7 +230,6 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, joblib.dump(y, sample_topics_path, compress=9) joblib.dump(categories, topics_path, compress=9) - else: y = joblib.load(sample_topics_path) categories = joblib.load(topics_path) diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index 556ad9ea45e05..1770889849209 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -38,33 +38,45 @@ # License: BSD 3 clause from io import BytesIO -from os import makedirs +from os import makedirs, remove from os.path import exists -try: - # Python 2 - from urllib2 import urlopen - PY2 = True -except ImportError: - # Python 3 - from urllib.request import urlopen - PY2 = False +import sys +import logging import numpy as np -from sklearn.datasets.base import get_data_home +from .base import get_data_home +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils import Bunch from sklearn.datasets.base import _pkl_filepath from sklearn.externals import joblib -DIRECTORY_URL = "http://biodiversityinformatics.amnh.org/open_source/maxent/" +PY3_OR_LATER = sys.version_info[0] >= 3 -SAMPLES_URL = DIRECTORY_URL + "samples.zip" -COVERAGES_URL = DIRECTORY_URL + "coverages.zip" +# The original data can be found at: +# http://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip +SAMPLES = RemoteFileMetadata( + filename='samples.zip', + url='https://ndownloader.figshare.com/files/5976075', + checksum=('abb07ad284ac50d9e6d20f1c4211e0fd' + '3c098f7f85955e89d321ee8efe37ac28')) + +# The original data can be found at: +# http://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip +COVERAGES = RemoteFileMetadata( + filename='coverages.zip', + url='https://ndownloader.figshare.com/files/5976078', + 
checksum=('4d862674d72e79d6cee77e63b98651ec' + '7926043ba7d39dcb31329cf3f6073807')) DATA_ARCHIVE_NAME = "species_coverage.pkz" +logger = logging.getLogger(__name__) + + def _load_coverage(F, header_length=6, dtype=np.int16): """Load a coverage file from an open file object. @@ -94,12 +106,13 @@ def _load_csv(F): rec : np.ndarray record array representing the data """ - if PY2: - # Numpy recarray wants Python 2 str but not unicode - names = F.readline().strip().split(',') - else: + if PY3_OR_LATER: # Numpy recarray wants Python 3 str but not bytes... names = F.readline().decode('ascii').strip().split(',') + else: + # Numpy recarray wants Python 2 str but not unicode + names = F.readline().strip().split(',') + rec = np.loadtxt(F, skiprows=0, delimiter=',', dtype='a22,f4,f4') rec.dtype.names = names return rec @@ -227,10 +240,11 @@ def fetch_species_distributions(data_home=None, if not exists(archive_path): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") - - print('Downloading species data from %s to %s' % (SAMPLES_URL, - data_home)) - X = np.load(BytesIO(urlopen(SAMPLES_URL).read())) + logger.info('Downloading species data from %s to %s' % ( + SAMPLES.url, data_home)) + samples_path = _fetch_remote(SAMPLES, dirname=data_home) + X = np.load(samples_path) # samples.zip is a valid npz + remove(samples_path) for f in X.files: fhandle = BytesIO(X[f]) @@ -239,15 +253,16 @@ def fetch_species_distributions(data_home=None, if 'test' in f: test = _load_csv(fhandle) - print('Downloading coverage data from %s to %s' % (COVERAGES_URL, - data_home)) - - X = np.load(BytesIO(urlopen(COVERAGES_URL).read())) + logger.info('Downloading coverage data from %s to %s' % ( + COVERAGES.url, data_home)) + coverages_path = _fetch_remote(COVERAGES, dirname=data_home) + X = np.load(coverages_path) # coverages.zip is a valid npz + remove(coverages_path) coverages = [] for f in X.files: fhandle = BytesIO(X[f]) - print(' - converting', f) + logger.debug(' - converting {}'.format(f)) coverages.append(_load_coverage(fhandle)) coverages = np.asarray(coverages, dtype=dtype) diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index 47b543d8d2e16..73025966ab072 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -49,23 +49,23 @@ from .base import get_data_home from .base import load_files from .base import _pkl_filepath +from .base import _fetch_remote +from .base import RemoteFileMetadata from ..utils import check_random_state, Bunch from ..feature_extraction.text import CountVectorizer from ..preprocessing import normalize -from ..externals import joblib, six - -if six.PY3: - from urllib.request import urlopen -else: - from urllib2 import urlopen - +from ..externals import joblib logger = logging.getLogger(__name__) +# The original data can be found at: +# http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz +ARCHIVE = RemoteFileMetadata( + filename='20news-bydate.tar.gz', + url='https://ndownloader.figshare.com/files/5975967', + checksum=('8f1b2514ca22a5ade8fbb9cfa5727df9' + '5fa587f4c87b786e15c759fa66d95610')) -URL = ("http://people.csail.mit.edu/jrennie/" - "20Newsgroups/20news-bydate.tar.gz") -ARCHIVE_NAME = "20news-bydate.tar.gz" CACHE_NAME = "20news-bydate.pkz" TRAIN_FOLDER = "20news-bydate-train" TEST_FOLDER = "20news-bydate-test" @@ -73,25 +73,16 @@ def download_20newsgroups(target_dir, cache_path): """Download the 20 newsgroups data and stored it as a zipped pickle.""" - 
archive_path = os.path.join(target_dir, ARCHIVE_NAME) train_path = os.path.join(target_dir, TRAIN_FOLDER) test_path = os.path.join(target_dir, TEST_FOLDER) if not os.path.exists(target_dir): os.makedirs(target_dir) - if os.path.exists(archive_path): - # Download is not complete as the .tar.gz file is removed after - # download. - logger.warning("Download was incomplete, downloading again.") - os.remove(archive_path) - - logger.warning("Downloading dataset from %s (14 MB)", URL) - opener = urlopen(URL) - with open(archive_path, 'wb') as f: - f.write(opener.read()) + logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url) + archive_path = _fetch_remote(ARCHIVE, dirname=target_dir) - logger.info("Decompressing %s", archive_path) + logger.debug("Decompressing %s", archive_path) tarfile.open(archive_path, "r:gz").extractall(path=target_dir) os.remove(archive_path)
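
The pattern this patch repeats in every fetcher — download with `_fetch_remote`, verify the SHA256 checksum against the `RemoteFileMetadata` entry, parse the archive, then `remove` it — can be sketched standalone as below. This is a minimal illustration, not part of the patch: the `EXAMPLE` descriptor (filename, URL, checksum) is a hypothetical placeholder, and it uses the Python 3 stdlib `urlretrieve` directly rather than the `sklearn.externals.six.moves` import the patch uses for Python 2 compatibility.

# Standalone sketch of the download-and-verify pattern introduced by this
# patch (hypothetical URL/checksum; the real fetchers point at figshare mirrors).
import hashlib
import os
from collections import namedtuple
from urllib.request import urlretrieve  # the patch imports this via six.moves

RemoteFileMetadata = namedtuple('RemoteFileMetadata',
                                ['filename', 'url', 'checksum'])


def sha256(path, chunk_size=8192):
    """Hash the file at `path` in fixed-size chunks to bound memory use."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()


def fetch_remote(remote, dirname=None):
    """Download `remote.url`, check its SHA256 checksum, return the local path."""
    file_path = (remote.filename if dirname is None
                 else os.path.join(dirname, remote.filename))
    urlretrieve(remote.url, file_path)
    checksum = sha256(file_path)
    if checksum != remote.checksum:
        raise IOError("{} has an SHA256 checksum ({}) differing from "
                      "expected ({}), file may be corrupted."
                      .format(file_path, checksum, remote.checksum))
    return file_path


if __name__ == '__main__':
    # Hypothetical dataset descriptor -- substitute a real URL and the SHA256
    # of the file it serves before running.
    EXAMPLE = RemoteFileMetadata(
        filename='example.csv.gz',
        url='https://example.com/example.csv.gz',
        checksum='0' * 64)
    path = fetch_remote(EXAMPLE, dirname='.')
    try:
        pass  # ...parse the archive here (np.loadtxt, tarfile, GzipFile, ...)
    finally:
        os.remove(path)  # like the fetchers, delete the raw archive once parsed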