Thanks to visit codestin.com
Credit goes to github.com

Skip to content

ENH Add retry mechanism to fetch_xx functions. #28160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
de729ca
Add arguments to fetch_xx functions. Add test for _fetch_remote.
fkdosilovic Jan 17, 2024
2f51fe5
Update whats_new/v1.5.
fkdosilovic Jan 17, 2024
3253e64
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Jan 18, 2024
d3af6b7
Minor update.
fkdosilovic Jan 18, 2024
9f08743
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Jan 20, 2024
7e0ec02
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Jan 22, 2024
5d73acc
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Jan 29, 2024
20fcb2a
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 3, 2024
9d57e5e
Update sklearn/datasets/_base.py
fkdosilovic Feb 3, 2024
76ddb72
Update.
fkdosilovic Feb 3, 2024
abfb598
Fix linting issues.
fkdosilovic Feb 3, 2024
c65fed5
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 4, 2024
cbff4f1
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 5, 2024
eef54f2
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 6, 2024
c6540bb
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 12, 2024
2557424
Fix error.
fkdosilovic Feb 14, 2024
4ce1d58
Merge branch 'Add-retry-mechanism-for-fetch_xx-functions' of github.c…
fkdosilovic Feb 14, 2024
81b1f1c
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 14, 2024
c108842
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 20, 2024
edab08e
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 22, 2024
f236247
Update delay type.
fkdosilovic Feb 22, 2024
f56388e
Update.
fkdosilovic Feb 23, 2024
58fb042
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 23, 2024
0bc2fb6
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
lesteve Feb 24, 2024
6547842
Merge branch 'main' into Add-retry-mechanism-for-fetch_xx-functions
fkdosilovic Feb 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,23 @@ Changelog
only `inverse_func` is provided without `func` (that would default to identity) being
explicitly set as well. :pr:`28483` by :user:`Stefanie Senger <StefanieSenger>`.

:mod:`sklearn.datasets`
.......................

- |Enhancement| Adds optional arguments `n_retries` and `delay` to functions
:func:`datasets.fetch_20newsgroups`,
:func:`datasets.fetch_20newsgroups_vectorized`,
:func:`datasets.fetch_california_housing`,
:func:`datasets.fetch_covtype`,
:func:`datasets.fetch_kddcup99`,
:func:`datasets.fetch_lfw_pairs`,
:func:`datasets.fetch_lfw_people`,
:func:`datasets.fetch_olivetti_faces`,
:func:`datasets.fetch_rcv1`,
and :func:`datasets.fetch_species_distributions`.
By default, the functions will retry up to 3 times in case of network failures.
:pr:`28160` by :user:`Zhehao Liu <MaxwellLZH>` and :user:`Filip Karlo Došilović <fkdosilovic>`.

:mod:`sklearn.dummy`
....................

Expand Down
28 changes: 26 additions & 2 deletions sklearn/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@
import hashlib
import os
import shutil
import time
import warnings
from collections import namedtuple
from importlib import resources
from numbers import Integral
from os import environ, listdir, makedirs
from os.path import expanduser, isdir, join, splitext
from pathlib import Path
from urllib.error import URLError
from urllib.request import urlretrieve

import numpy as np
Expand Down Expand Up @@ -1408,7 +1411,7 @@ def _sha256(path):
return sha256hash.hexdigest()


def _fetch_remote(remote, dirname=None):
def _fetch_remote(remote, dirname=None, n_retries=3, delay=1):
"""Helper function to download a remote dataset into path

Fetch a dataset pointed by remote's url, save into path using remote's
Expand All @@ -1424,14 +1427,35 @@ def _fetch_remote(remote, dirname=None):
dirname : str
Directory to save the file to.

n_retries : int, default=3
Number of retries when HTTP errors are encountered.

.. versionadded:: 1.5

delay : int, default=1
Number of seconds between retries.

.. versionadded:: 1.5

Returns
-------
file_path: str
Full path of the created file.
"""

file_path = remote.filename if dirname is None else join(dirname, remote.filename)
urlretrieve(remote.url, file_path)
while True:
try:
urlretrieve(remote.url, file_path)
break
except (URLError, TimeoutError):
if n_retries == 0:
# If no more retries are left, re-raise the caught exception.
raise
warnings.warn(f"Retry downloading from url: {remote.url}")
n_retries -= 1
time.sleep(delay)

checksum = _sha256(file_path)
if remote.checksum != checksum:
raise OSError(
Expand Down
30 changes: 27 additions & 3 deletions sklearn/datasets/_california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@

import logging
import tarfile
from numbers import Integral, Real
from os import PathLike, makedirs, remove
from os.path import exists

import joblib
import numpy as np

from ..utils import Bunch
from ..utils._param_validation import validate_params
from ..utils._param_validation import Interval, validate_params
from . import get_data_home
from ._base import (
RemoteFileMetadata,
Expand All @@ -57,11 +58,19 @@
"download_if_missing": ["boolean"],
"return_X_y": ["boolean"],
"as_frame": ["boolean"],
"n_retries": [Interval(Integral, 1, None, closed="left")],
"delay": [Interval(Real, 0.0, None, closed="neither")],
},
prefer_skip_nested_validation=True,
)
def fetch_california_housing(
*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False
*,
data_home=None,
download_if_missing=True,
return_X_y=False,
as_frame=False,
n_retries=3,
delay=1.0,
):
"""Load the California housing dataset (regression).

Expand Down Expand Up @@ -97,6 +106,16 @@ def fetch_california_housing(

.. versionadded:: 0.23

n_retries : int, default=3
Number of retries when HTTP errors are encountered.

.. versionadded:: 1.5

delay : float, default=1.0
Number of seconds between retries.

.. versionadded:: 1.5

Returns
-------
dataset : :class:`~sklearn.utils.Bunch`
Expand Down Expand Up @@ -154,7 +173,12 @@ def fetch_california_housing(
"Downloading Cal. housing from {} to {}".format(ARCHIVE.url, data_home)
)

archive_path = _fetch_remote(ARCHIVE, dirname=data_home)
archive_path = _fetch_remote(
ARCHIVE,
dirname=data_home,
n_retries=n_retries,
delay=delay,
)

with tarfile.open(mode="r:gz", name=archive_path) as f:
cal_housing = np.loadtxt(
Expand Down
21 changes: 19 additions & 2 deletions sklearn/datasets/_covtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
import logging
import os
from gzip import GzipFile
from numbers import Integral, Real
from os.path import exists, join
from tempfile import TemporaryDirectory

import joblib
import numpy as np

from ..utils import Bunch, check_random_state
from ..utils._param_validation import validate_params
from ..utils._param_validation import Interval, validate_params
from . import get_data_home
from ._base import (
RemoteFileMetadata,
Expand Down Expand Up @@ -71,6 +72,8 @@
"shuffle": ["boolean"],
"return_X_y": ["boolean"],
"as_frame": ["boolean"],
"n_retries": [Interval(Integral, 1, None, closed="left")],
"delay": [Interval(Real, 0.0, None, closed="neither")],
},
prefer_skip_nested_validation=True,
)
Expand All @@ -82,6 +85,8 @@ def fetch_covtype(
shuffle=False,
return_X_y=False,
as_frame=False,
n_retries=3,
delay=1.0,
):
"""Load the covertype dataset (classification).

Expand Down Expand Up @@ -129,6 +134,16 @@ def fetch_covtype(

.. versionadded:: 0.24

n_retries : int, default=3
Number of retries when HTTP errors are encountered.

.. versionadded:: 1.5

delay : float, default=1.0
Number of seconds between retries.

.. versionadded:: 1.5

Returns
-------
dataset : :class:`~sklearn.utils.Bunch`
Expand Down Expand Up @@ -183,7 +198,9 @@ def fetch_covtype(
# os.rename to atomically move the data files to their target location.
with TemporaryDirectory(dir=covtype_dir) as temp_dir:
logger.info(f"Downloading {ARCHIVE.url}")
archive_path = _fetch_remote(ARCHIVE, dirname=temp_dir)
archive_path = _fetch_remote(
ARCHIVE, dirname=temp_dir, n_retries=n_retries, delay=delay
)
Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",")

X = Xy[:, :-1]
Expand Down
31 changes: 28 additions & 3 deletions sklearn/datasets/_kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
import logging
import os
from gzip import GzipFile
from numbers import Integral, Real
from os.path import exists, join

import joblib
import numpy as np

from ..utils import Bunch, check_random_state
from ..utils import shuffle as shuffle_method
from ..utils._param_validation import StrOptions, validate_params
from ..utils._param_validation import Interval, StrOptions, validate_params
from . import get_data_home
from ._base import (
RemoteFileMetadata,
Expand Down Expand Up @@ -57,6 +58,8 @@
"download_if_missing": ["boolean"],
"return_X_y": ["boolean"],
"as_frame": ["boolean"],
"n_retries": [Interval(Integral, 1, None, closed="left")],
"delay": [Interval(Real, 0.0, None, closed="neither")],
},
prefer_skip_nested_validation=True,
)
Expand All @@ -70,6 +73,8 @@ def fetch_kddcup99(
download_if_missing=True,
return_X_y=False,
as_frame=False,
n_retries=3,
delay=1.0,
):
"""Load the kddcup99 dataset (classification).

Expand Down Expand Up @@ -127,6 +132,16 @@ def fetch_kddcup99(

.. versionadded:: 0.24

n_retries : int, default=3
Number of retries when HTTP errors are encountered.

.. versionadded:: 1.5

delay : float, default=1.0
Number of seconds between retries.

.. versionadded:: 1.5

Returns
-------
data : :class:`~sklearn.utils.Bunch`
Expand Down Expand Up @@ -160,6 +175,8 @@ def fetch_kddcup99(
data_home=data_home,
percent10=percent10,
download_if_missing=download_if_missing,
n_retries=n_retries,
delay=delay,
)

data = kddcup99.data
Expand Down Expand Up @@ -243,7 +260,9 @@ def fetch_kddcup99(
)


def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True):
def _fetch_brute_kddcup99(
data_home=None, download_if_missing=True, percent10=True, n_retries=3, delay=1.0
):
"""Load the kddcup99 dataset, downloading it if necessary.

Parameters
Expand All @@ -259,6 +278,12 @@ def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=Tr
percent10 : bool, default=True
Whether to load only 10 percent of the data.

n_retries : int, default=3
Number of retries when HTTP errors are encountered.

delay : float, default=1.0
Number of seconds between retries.

Returns
-------
dataset : :class:`~sklearn.utils.Bunch`
Expand Down Expand Up @@ -354,7 +379,7 @@ def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=Tr
elif download_if_missing:
_mkdirp(kddcup_dir)
logger.info("Downloading %s" % archive.url)
_fetch_remote(archive, dirname=kddcup_dir)
_fetch_remote(archive, dirname=kddcup_dir, n_retries=n_retries, delay=delay)
DT = np.dtype(dt)
logger.debug("extracting archive")
archive_path = join(kddcup_dir, archive.filename)
Expand Down
Loading