Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ Changelog
:func:`datasets.fetch_20newsgroups` and :func:`datasets.fetch_olivetti_faces`
. :pr:`14259` by :user:`Sourav Singh <souravsingh>`.

- |Fix| Fixed a bug in :func:`datasets.fetch_openml`, which failed to load
an OpenML dataset that contains an ignored feature.
:pr:`14623` by :user:`Sarra Habchi <HabchiSarra>`.

:mod:`sklearn.decomposition`
............................

Expand Down
37 changes: 23 additions & 14 deletions sklearn/datasets/openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,17 +424,28 @@ def _get_data_qualities(data_id, data_home):
return None


def _get_data_shape(data_qualities):
# Using the data_info dictionary from _get_data_info_by_name to extract
# the number of samples / features
def _get_num_samples(data_qualities):
"""Get the number of samples from data qualities.

Parameters
----------
data_qualities : list of dict
Used to retrieve the number of instances (samples) in the dataset.

Returns
-------
n_samples : int
The number of samples in the dataset or -1 if data qualities are
unavailable.
"""
# If the data qualities are unavailable, we return -1
default_n_samples = -1

if data_qualities is None:
return None
return default_n_samples

qualities = {d['name']: d['value'] for d in data_qualities}
try:
return (int(float(qualities['NumberOfInstances'])),
int(float(qualities['NumberOfFeatures'])))
except AttributeError:
return None
return int(float(qualities.get('NumberOfInstances', default_n_samples)))


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
Expand Down Expand Up @@ -708,12 +719,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,

# determine arff encoding to return
if not return_sparse:
# The shape must include the ignored features to keep the right indexes
# during the arff data conversion.
data_qualities = _get_data_qualities(data_id, data_home)
shape = _get_data_shape(data_qualities)
# if the data qualities were not available, we can still get the
# n_features from the feature list, with the n_samples unknown
if shape is None:
shape = (-1, len(features_list))
shape = _get_num_samples(data_qualities), len(features_list)
else:
shape = None

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,3 +1158,18 @@ def test_fetch_openml_raises_illegal_argument():

assert_raise_message(ValueError, "Neither name nor data_id are provided. "
"Please provide name or data_id.", fetch_openml)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
    """Check fetch_openml on a dataset containing an ignored feature.

    Non-regression test: loading an OpenML dataset with a feature flagged
    as 'ignored' used to fail; the ignored feature must also be absent
    from the returned Bunch.
    """
    # Regression test for #14340
    # 62 is the ID of the ZOO dataset
    data_id = 62
    # Redirect the OpenML HTTP calls to local test fixtures (with and
    # without gzip encoding, per the parametrization).
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False)
    assert dataset is not None
    # The dataset has 17 features, including 1 ignored (animal),
    # so we assert that we don't have the ignored feature in the final Bunch
    assert dataset['data'].shape == (101, 16)
    assert 'animal' not in dataset['feature_names']