From 1c26cb261928c6a2b17430f35e304a77813fe9b9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 May 2023 16:36:45 +0200 Subject: [PATCH 1/4] ENH allows to overwrite read_csv parameter in fetch_openml --- doc/whats_new/v1.3.rst | 5 ++++ sklearn/datasets/_openml.py | 34 +++++++++++++++++++++++++-- sklearn/datasets/tests/test_openml.py | 28 ++++++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 38bf60ab4cd64..91e3b8637dc53 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -238,6 +238,11 @@ Changelog - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. +- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using + the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the + pandas parser. + :pr:`25488` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 28713ca739ffd..047fc87c1f18c 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -428,6 +428,7 @@ def _load_arff_response( md5_checksum: str, n_retries: int = 3, delay: float = 1.0, + read_csv_kwargs: Optional[Dict] = None, ): """Load the ARFF data associated with the OpenML URL. @@ -470,6 +471,18 @@ def _load_arff_response( md5_checksum : str The MD5 checksum provided by OpenML to check the data integrity. + n_retries : int, default=3 + The number of times to retry downloading the data if it fails. + + delay : float, default=1.0 + The delay between two consecutive downloads in seconds. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite + the default options. + + .. 
versionadded:: 1.3 + Returns ------- X : {ndarray, sparse matrix, dataframe} @@ -506,13 +519,14 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params): with closing(gzip_file): return load_arff_from_gzip_file(gzip_file, **arff_params) - arff_params = dict( + arff_params: Dict = dict( parser=parser, output_type=output_type, openml_columns_info=openml_columns_info, feature_names_to_select=feature_names_to_select, target_names_to_select=target_names_to_select, shape=shape, + read_csv_kwargs=read_csv_kwargs or {}, ) try: X, y, frame, categories = _open_url_and_load_gzip_file( @@ -530,7 +544,7 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params): # A parsing error could come from providing the wrong quotechar # to pandas. By default, we use a double quote. Thus, we retry # with a single quote before to raise the error. - arff_params["read_csv_kwargs"] = {"quotechar": "'"} + arff_params["read_csv_kwargs"].update(quotechar="'") X, y, frame, categories = _open_url_and_load_gzip_file( url, data_home, n_retries, delay, arff_params ) @@ -552,6 +566,7 @@ def _download_data_to_bunch( n_retries: int = 3, delay: float = 1.0, parser: str, + read_csv_kwargs: Optional[Dict] = None, ): """Download ARFF data, load it to a specific container and create to Bunch. @@ -598,6 +613,12 @@ def _download_data_to_bunch( parser : {"liac-arff", "pandas"} The parser used to parse the ARFF file. + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite + the default options. + + .. 
versionadded:: 1.3 + Returns ------- data : :class:`~sklearn.utils.Bunch` @@ -657,6 +678,7 @@ def _download_data_to_bunch( md5_checksum=md5_checksum, n_retries=n_retries, delay=delay, + read_csv_kwargs=read_csv_kwargs, ) return Bunch( @@ -725,6 +747,7 @@ def fetch_openml( n_retries: int = 3, delay: float = 1.0, parser: Optional[str] = "warn", + read_csv_kwargs: Optional[Dict] = None, ): """Fetch dataset from openml by name or dataset id. @@ -829,6 +852,12 @@ def fetch_openml( warning. Therefore, an `ImportError` will be raised from 1.4 if the dataset is dense and pandas is not installed. + read_csv_kwargs : dict, default=None + Keyword arguments passed to `pandas.read_csv` when loading the data + from a ARFF file. It can allows to overwrite some default parameters. + + .. versionadded:: 1.3 + Returns ------- data : :class:`~sklearn.utils.Bunch` @@ -1096,6 +1125,7 @@ def fetch_openml( n_retries=n_retries, delay=delay, parser=parser_, + read_csv_kwargs=read_csv_kwargs, ) if return_X_y: diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index fef03a5fd4d40..6f2ba36ff808a 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1354,6 +1354,34 @@ def test_dataset_with_openml_warning(monkeypatch, gzip_response): fetch_openml(data_id=data_id, cache=False, as_frame=False, parser="liac-arff") +def test_fetch_openml_overwrite_default_params_read_csv(monkeypatch): + """Check that we can overwrite the default parameters of `read_csv`.""" + pytest.importorskip("pandas") + data_id = 1590 + _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False) + + common_params = { + "data_id": data_id, + "as_frame": True, + "cache": False, + "parser": "pandas", + } + + # By default, the initial spaces are skipped. We check that setting the parameter + # `skipinitialspace` to False has an effect.
+ adult_without_spaces = fetch_openml(**common_params) + adult_with_spaces = fetch_openml( + **common_params, read_csv_kwargs={"skipinitialspace": False} + ) + assert all( + cat.startswith(" ") for cat in adult_with_spaces.frame["class"].cat.categories + ) + assert not any( + cat.startswith(" ") + for cat in adult_without_spaces.frame["class"].cat.categories + ) + + ############################################################################### # Test cache, retry mechanisms, checksum, etc. From e5802ff42e7b1f821bb82f19b70e33bea5d100ef Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 May 2023 16:54:20 +0200 Subject: [PATCH 2/4] update pr number --- doc/whats_new/v1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 91e3b8637dc53..a4d8de42c514b 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -241,7 +241,7 @@ Changelog - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the pandas parser. - :pr:`25488` by :user:`Guillaume Lemaitre `. + :pr:`26433` by :user:`Guillaume Lemaitre `. :mod:`sklearn.decomposition` ............................ From e6005888790c6716a763878baa18cd2b5e4684dd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 May 2023 18:02:37 +0200 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/datasets/_openml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 047fc87c1f18c..2aa997bec8b12 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -478,8 +478,8 @@ def _load_arff_response( The delay between two consecutive downloads in seconds. read_csv_kwargs : dict, default=None - Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite - the default options. 
+ Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. + It allows overwriting the default options. .. versionadded:: 1.3 @@ -614,8 +614,8 @@ def _download_data_to_bunch( The parser used to parse the ARFF file. read_csv_kwargs : dict, default=None - Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite - the default options. + Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. + It allows overwriting the default options. .. versionadded:: 1.3 @@ -854,7 +854,8 @@ def fetch_openml( read_csv_kwargs : dict, default=None Keyword arguments passed to `pandas.read_csv` when loading the data - from a ARFF file. It can allows to overwrite some default parameters. + from an ARFF file and using the pandas parser. It allows + overwriting some default parameters. .. versionadded:: 1.3 From 25eb07c565d6e68de2b03e3828ecb15f6c871c3b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Jun 2023 13:59:30 +0200 Subject: [PATCH 4/4] Update sklearn/datasets/_openml.py Co-authored-by: Adrin Jalali --- sklearn/datasets/_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 2aa997bec8b12..d1a1285eb5a37 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -853,7 +853,7 @@ def fetch_openml( the dataset is dense and pandas is not installed. read_csv_kwargs : dict, default=None - Keyword arguments passed to `pandas.read_csv` when loading the data + Keyword arguments passed to :func:`pandas.read_csv` when loading the data from an ARFF file and using the pandas parser. It allows overwriting some default parameters.