Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ Decomposition, manifold learning and clustering
Similarly, the ``n_components=None`` case now selects the minimum of
n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze <wallygauze>`.

- Fixed a bug in :func:`datasets.fetch_kddcup99`, where data were not properly
shuffled. :issue:`9731` by `Nicolas Goix`_.

API changes summary
-------------------

Expand Down
13 changes: 5 additions & 8 deletions sklearn/datasets/kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,

"""
data_home = get_data_home(data_home=data_home)
kddcup99 = _fetch_brute_kddcup99(data_home=data_home, shuffle=shuffle,
kddcup99 = _fetch_brute_kddcup99(data_home=data_home,
percent10=percent10,
download_if_missing=download_if_missing)

Expand Down Expand Up @@ -225,12 +225,15 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
if subset == 'SF':
data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]]

if shuffle:
data, target = shuffle_method(data, target, random_state=random_state)

return Bunch(data=data, target=target)


def _fetch_brute_kddcup99(data_home=None,
download_if_missing=True, random_state=None,
shuffle=False, percent10=True):
percent10=True):

"""Load the kddcup99 dataset, downloading it if necessary.

Expand All @@ -251,9 +254,6 @@ def _fetch_brute_kddcup99(data_home=None,
If None, the random number generator is the RandomState instance used
by `np.random`.

shuffle : bool, default=False
Whether to shuffle dataset.

percent10 : bool, default=True
Whether to load only 10 percent of the data.

Expand Down Expand Up @@ -372,9 +372,6 @@ def _fetch_brute_kddcup99(data_home=None,
X = joblib.load(samples_path)
y = joblib.load(targets_path)

if shuffle:
X, y = shuffle_method(X, y, random_state=random_state)

return Bunch(data=X, target=y, DESCR=__doc__)


Expand Down
10 changes: 10 additions & 0 deletions sklearn/datasets/tests/test_kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,13 @@ def test_percent10():
data = fetch_kddcup99('smtp')
assert_equal(data.data.shape, (9571, 3))
assert_equal(data.target.shape, (9571,))


def test_shuffle():
try:
dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True,
percent10=True, download_if_missing=False)
except IOError:
raise SkipTest("kddcup99 dataset can not be loaded.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is not tested in CI?


assert(any(dataset.target[-100:] == b'normal.'))