From d386d718b551fd9704077a204548650811ffb0c5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 12 Mar 2021 20:41:39 -0500 Subject: [PATCH 1/2] ENH Better error for corrupted files in fetch_kddcup99 --- sklearn/conftest.py | 7 +++++-- sklearn/datasets/_kddcup99.py | 23 +++++++++++++---------- sklearn/datasets/tests/test_kddcup99.py | 16 ++++++++++++++++ 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 2978115e3091c..70fec749b7c8e 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -35,8 +35,11 @@ def wrapped(*args, **kwargs): kwargs['download_if_missing'] = download_if_missing try: return f(*args, **kwargs) - except IOError: - pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0") + except IOError as e: + if str(e) != "Data not found and `download_if_missing` is False": + raise + pytest.skip("test is enabled when " + "SKLEARN_SKIP_NETWORK_TESTS=0") return pytest.fixture(lambda: wrapped) diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 539b7ffaf862e..26fb14197a211 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -315,7 +315,17 @@ def _fetch_brute_kddcup99(data_home=None, column_names = [c[0] for c in dt] target_names = column_names[-1] feature_names = column_names[:-1] - if download_if_missing and not available: + + if available: + try: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + except Exception as e: + raise IOError( + "The cache for fetch_kddcup99 is invalid, please delete " + f"{str(kddcup_dir)} and run the fetch_kddcup99 again") from e + + elif download_if_missing: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) @@ -343,15 +353,8 @@ def _fetch_brute_kddcup99(data_home=None, joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) - elif not available: - if not download_if_missing: - raise IOError("Data not found and `download_if_missing` is False") - - try: - X, y - except NameError: - X = joblib.load(samples_path) - y = joblib.load(targets_path) + else: + raise IOError("Data not found and `download_if_missing` is False") return Bunch( data=X, diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 5119d0cda13a2..08017298d20e8 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -58,3 +58,19 @@ def test_fetch_kddcup99_shuffle(fetch_kddcup99_fxt): def test_pandas_dependency_message(fetch_kddcup99_fxt, hide_available_pandas): check_pandas_dependency_message(fetch_kddcup99_fxt) + + +def test_corrupted_file_error_message(fetch_kddcup99_fxt, tmp_path): + """Check that a nice error message is raised when cache is corrupted.""" + kddcup99_dir = tmp_path / "kddcup99_10-py3" + kddcup99_dir.mkdir() + samples_path = kddcup99_dir / "samples" + + with samples_path.open("wb") as f: + f.write(b"THIS IS CORRUPTED") + + msg = (f"The cache for fetch_kddcup99 is invalid, please " + f"delete {str(kddcup99_dir)} and run the fetch_kddcup99 again") + + with pytest.raises(IOError, match=msg): + fetch_kddcup99_fxt(data_home=str(tmp_path)) From e46c9516cc3e8e7533996471a5e58f2b7c97baf2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 12 Mar 2021 20:57:36 -0500 Subject: [PATCH 2/2] DOC Adds whats new --- doc/whats_new/v1.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a1f21723bac28..82aa5ebe04f25 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -68,6 +68,9 @@ Changelog `Thomas Fan`_ and :user:`Amanda Dsouza ` and :user:`EL-ATEIF Sara `. +- |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message + when the cached file is invalid. :pr:`19669` `Thomas Fan`_. + :mod:`sklearn.decomposition` ............................