Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ Changelog
:func:`datasets.fetch_20newsgroups` and :func:`datasets.fetch_olivetti_faces`
. :pr:`14259` by :user:`Sourav Singh <souravsingh>`.

- |Fix| Fixed a bug in :func:`datasets.fetch_openml`, which failed to load
an OpenML dataset that contains an ignored feature.
:pr:`14623` by :user:`Sarra Habchi <HabchiSarra>`.

:mod:`sklearn.decomposition`
............................

Expand Down
37 changes: 23 additions & 14 deletions sklearn/datasets/openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,17 +424,28 @@ def _get_data_qualities(data_id, data_home):
return None


def _get_data_shape(data_qualities):
# Using the data_info dictionary from _get_data_info_by_name to extract
# the number of samples / features
def _get_num_samples(data_qualities):
"""Get the number of samples from data qualities.

Parameters
----------
data_qualities : list of dict
Used to retrieve the number of instances (samples) in the dataset.

Returns
-------
n_samples : int
The number of samples in the dataset or -1 if data qualities are
unavailable.
"""
# If the data qualities are unavailable, we return -1
default_n_samples = -1

if data_qualities is None:
return None
return default_n_samples

qualities = {d['name']: d['value'] for d in data_qualities}
try:
return (int(float(qualities['NumberOfInstances'])),
int(float(qualities['NumberOfFeatures'])))
except AttributeError:
return None
return int(float(qualities.get('NumberOfInstances', default_n_samples)))


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
Expand Down Expand Up @@ -708,12 +719,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,

# determine arff encoding to return
if not return_sparse:
# The shape must include the ignored features to keep the right indexes
# during the arff data conversion.
data_qualities = _get_data_qualities(data_id, data_home)
shape = _get_data_shape(data_qualities)
# if the data qualities were not available, we can still get the
# n_features from the feature list, with the n_samples unknown
if shape is None:
shape = (-1, len(features_list))
shape = _get_num_samples(data_qualities), len(features_list)
else:
shape = None

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,3 +1158,18 @@ def test_fetch_openml_raises_illegal_argument():

assert_raise_message(ValueError, "Neither name nor data_id are provided. "
"Please provide name or data_id.", fetch_openml)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
    """Check fetch_openml on a dataset containing an ignored feature.

    Non-regression test: loading an OpenML dataset with a feature flagged
    as 'ignored' used to fail; the ignored feature must also be absent
    from the returned Bunch.
    """
    # Regression test for #14340
    # 62 is the ID of the ZOO dataset
    data_id = 62
    # Redirect the OpenML HTTP calls to local test fixtures (with and
    # without gzip encoding, per the parametrization).
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False)
    assert dataset is not None
    # The dataset has 17 features, including 1 ignored (animal),
    # so we assert that we don't have the ignored feature in the final Bunch
    assert dataset['data'].shape == (101, 16)
    assert 'animal' not in dataset['feature_names']