diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index a1f21723bac28..82aa5ebe04f25 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -68,6 +68,9 @@ Changelog
   `Thomas Fan`_ and :user:`Amanda Dsouza ` and
   :user:`EL-ATEIF Sara `.
 
+- |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message
+  when the cached file is invalid. :pr:`19669` `Thomas Fan`_.
+
 :mod:`sklearn.decomposition`
 ............................
 
diff --git a/sklearn/conftest.py b/sklearn/conftest.py
index 2978115e3091c..70fec749b7c8e 100644
--- a/sklearn/conftest.py
+++ b/sklearn/conftest.py
@@ -35,8 +35,11 @@ def wrapped(*args, **kwargs):
         kwargs['download_if_missing'] = download_if_missing
         try:
             return f(*args, **kwargs)
-        except IOError:
-            pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
+        except IOError as e:
+            if str(e) != "Data not found and `download_if_missing` is False":
+                raise
+            pytest.skip("test is enabled when "
+                        "SKLEARN_SKIP_NETWORK_TESTS=0")
     return pytest.fixture(lambda: wrapped)
 
 
diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py
index 539b7ffaf862e..26fb14197a211 100644
--- a/sklearn/datasets/_kddcup99.py
+++ b/sklearn/datasets/_kddcup99.py
@@ -315,7 +315,17 @@ def _fetch_brute_kddcup99(data_home=None,
     column_names = [c[0] for c in dt]
     target_names = column_names[-1]
     feature_names = column_names[:-1]
-    if download_if_missing and not available:
+
+    if available:
+        try:
+            X = joblib.load(samples_path)
+            y = joblib.load(targets_path)
+        except Exception as e:
+            raise IOError(
+                "The cache for fetch_kddcup99 is invalid, please delete "
+                f"{str(kddcup_dir)} and run the fetch_kddcup99 again") from e
+
+    elif download_if_missing:
         _mkdirp(kddcup_dir)
         logger.info("Downloading %s" % archive.url)
         _fetch_remote(archive, dirname=kddcup_dir)
@@ -343,15 +353,8 @@ def _fetch_brute_kddcup99(data_home=None,
 
         joblib.dump(X, samples_path, compress=0)
         joblib.dump(y, targets_path, compress=0)
-    elif not available:
-        if not download_if_missing:
-            raise IOError("Data not found and `download_if_missing` is False")
-
-    try:
-        X, y
-    except NameError:
-        X = joblib.load(samples_path)
-        y = joblib.load(targets_path)
+    else:
+        raise IOError("Data not found and `download_if_missing` is False")
 
     return Bunch(
         data=X,
diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py
index 5119d0cda13a2..08017298d20e8 100644
--- a/sklearn/datasets/tests/test_kddcup99.py
+++ b/sklearn/datasets/tests/test_kddcup99.py
@@ -58,3 +58,19 @@ def test_fetch_kddcup99_shuffle(fetch_kddcup99_fxt):
 def test_pandas_dependency_message(fetch_kddcup99_fxt,
                                    hide_available_pandas):
     check_pandas_dependency_message(fetch_kddcup99_fxt)
+
+
+def test_corrupted_file_error_message(fetch_kddcup99_fxt, tmp_path):
+    """Check that a nice error message is raised when cache is corrupted."""
+    kddcup99_dir = tmp_path / "kddcup99_10-py3"
+    kddcup99_dir.mkdir()
+    samples_path = kddcup99_dir / "samples"
+
+    with samples_path.open("wb") as f:
+        f.write(b"THIS IS CORRUPTED")
+
+    msg = (f"The cache for fetch_kddcup99 is invalid, please "
+           f"delete {str(kddcup99_dir)} and run the fetch_kddcup99 again")
+
+    with pytest.raises(IOError, match=msg):
+        fetch_kddcup99_fxt(data_home=str(tmp_path))