ENH allows to overwrite read_csv parameter in fetch_openml #26433


Merged 4 commits on Jun 7, 2023
5 changes: 5 additions & 0 deletions doc/whats_new/v1.3.rst
@@ -238,6 +238,11 @@ Changelog
- |Fix| :func:`datasets.fetch_openml` returns improved data types when
`as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_.

- |Enhancement| Allow overriding the parameters used to read the ARFF file through
the new `read_csv_kwargs` parameter in :func:`datasets.fetch_openml` when using the
pandas parser.
:pr:`26433` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.decomposition`
............................

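For context, a minimal usage sketch of the enhancement described in the changelog entry above. It assumes network access and uses the adult dataset (`data_id=1590`) that the new test below also relies on, with `skipinitialspace` as the overridden option; the printed categories are indicative only.

from sklearn.datasets import fetch_openml

# Default behaviour of the pandas parser: leading spaces in string fields are stripped.
adult_default = fetch_openml(data_id=1590, as_frame=True, parser="pandas")

# Override one of the read_csv defaults to keep the leading spaces from the raw payload.
adult_raw = fetch_openml(
    data_id=1590,
    as_frame=True,
    parser="pandas",
    read_csv_kwargs={"skipinitialspace": False},
)

print(adult_default.frame["class"].cat.categories)  # e.g. ['<=50K', '>50K']
print(adult_raw.frame["class"].cat.categories)      # e.g. [' <=50K', ' >50K']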
35 changes: 33 additions & 2 deletions sklearn/datasets/_openml.py
@@ -428,6 +428,7 @@ def _load_arff_response(
md5_checksum: str,
n_retries: int = 3,
delay: float = 1.0,
read_csv_kwargs: Optional[Dict] = None,
):
"""Load the ARFF data associated with the OpenML URL.

@@ -470,6 +471,18 @@
md5_checksum : str
md5_checksum : str
The MD5 checksum provided by OpenML to check the data integrity.

n_retries : int, default=3
The number of times to retry downloading the data if it fails.

delay : float, default=1.0
The delay between two consecutive downloads in seconds.

read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
It allows overriding the default options.

.. versionadded:: 1.3

Returns
-------
X : {ndarray, sparse matrix, dataframe}
@@ -506,13 +519,14 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
with closing(gzip_file):
return load_arff_from_gzip_file(gzip_file, **arff_params)

arff_params = dict(
arff_params: Dict = dict(
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
read_csv_kwargs=read_csv_kwargs or {},
)
try:
X, y, frame, categories = _open_url_and_load_gzip_file(
@@ -530,7 +544,7 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
# A parsing error could come from providing the wrong quotechar
# to pandas. By default, we use a double quote. Thus, we retry
# with a single quote before raising the error.
arff_params["read_csv_kwargs"] = {"quotechar": "'"}
arff_params["read_csv_kwargs"].update(quotechar="'")
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
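The retry logic above now updates the existing `read_csv_kwargs` in place instead of rebinding it to a fresh dict. A standalone sketch in plain Python (with a hypothetical user-supplied option) of why that matters:

user_kwargs = {"skipinitialspace": False}  # hypothetical user-provided read_csv_kwargs

# Previous behaviour: rebinding drops the user option on the single-quote retry.
old_retry = {"quotechar": "'"}

# New behaviour: updating in place overrides only the quote character.
new_retry = dict(user_kwargs)
new_retry.update(quotechar="'")

assert old_retry == {"quotechar": "'"}
assert new_retry == {"skipinitialspace": False, "quotechar": "'"}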
@@ -552,6 +566,7 @@ def _download_data_to_bunch(
n_retries: int = 3,
delay: float = 1.0,
parser: str,
read_csv_kwargs: Optional[Dict] = None,
):
"""Download ARFF data, load it to a specific container and create to Bunch.

@@ -598,6 +613,12 @@ def _download_data_to_bunch(
parser : {"liac-arff", "pandas"}
The parser used to parse the ARFF file.

read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
It allows overriding the default options.

.. versionadded:: 1.3

Returns
-------
data : :class:`~sklearn.utils.Bunch`
@@ -657,6 +678,7 @@ def _download_data_to_bunch(
md5_checksum=md5_checksum,
n_retries=n_retries,
delay=delay,
read_csv_kwargs=read_csv_kwargs,
)

return Bunch(
@@ -725,6 +747,7 @@ def fetch_openml(
n_retries: int = 3,
delay: float = 1.0,
parser: Optional[str] = "warn",
read_csv_kwargs: Optional[Dict] = None,
):
"""Fetch dataset from openml by name or dataset id.

@@ -829,6 +852,13 @@ def fetch_openml(
warning. Therefore, an `ImportError` will be raised from 1.4 if
the dataset is dense and pandas is not installed.

read_csv_kwargs : dict, default=None
Keyword arguments passed to :func:`pandas.read_csv` when loading the data
from an ARFF file and using the pandas parser. It allows overriding
some default parameters.

.. versionadded:: 1.3

Returns
-------
data : :class:`~sklearn.utils.Bunch`
@@ -1096,6 +1126,7 @@ def fetch_openml(
n_retries=n_retries,
delay=delay,
parser=parser_,
read_csv_kwargs=read_csv_kwargs,
)

if return_X_y:
28 changes: 28 additions & 0 deletions sklearn/datasets/tests/test_openml.py
@@ -1354,6 +1354,34 @@ def test_dataset_with_openml_warning(monkeypatch, gzip_response):
fetch_openml(data_id=data_id, cache=False, as_frame=False, parser="liac-arff")


def test_fetch_openml_overwrite_default_params_read_csv(monkeypatch):
"""Check that we can overwrite the default parameters of `read_csv`."""
pytest.importorskip("pandas")
data_id = 1590
_monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

common_params = {
"data_id": data_id,
"as_frame": True,
"cache": False,
"parser": "pandas",
}

# By default, the initial spaces are skipped. We check that setting the parameter
# `skipinitialspace` to False has an effect.
adult_without_spaces = fetch_openml(**common_params)
adult_with_spaces = fetch_openml(
**common_params, read_csv_kwargs={"skipinitialspace": False}
)
assert all(
cat.startswith(" ") for cat in adult_with_spaces.frame["class"].cat.categories
)
assert not any(
cat.startswith(" ")
for cat in adult_without_spaces.frame["class"].cat.categories
)
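As a standalone illustration, independent of the OpenML test fixtures above, of what the overridden `skipinitialspace` option controls in `pandas.read_csv`:

from io import StringIO

import pandas as pd

csv_payload = "age,class\n25, <=50K\n42, >50K\n"  # values carry a leading space

with_skip = pd.read_csv(StringIO(csv_payload), skipinitialspace=True)
without_skip = pd.read_csv(StringIO(csv_payload), skipinitialspace=False)

assert list(with_skip["class"]) == ["<=50K", ">50K"]
assert list(without_skip["class"]) == [" <=50K", " >50K"]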


###############################################################################
# Test cache, retry mechanisms, checksum, etc.
