From 1c26cb261928c6a2b17430f35e304a77813fe9b9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 25 May 2023 16:36:45 +0200
Subject: [PATCH 1/4] ENH allow overriding read_csv parameters in
 fetch_openml

---
 doc/whats_new/v1.3.rst                |  5 ++++
 sklearn/datasets/_openml.py           | 34 +++++++++++++++++++++++++--
 sklearn/datasets/tests/test_openml.py | 28 ++++++++++++++++++++++
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 38bf60ab4cd64..91e3b8637dc53 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -238,6 +238,11 @@ Changelog
 - |Fix| :func:`datasets.fetch_openml` returns improved data types when
   `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_.
 
+- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using
+  the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the
+  pandas parser.
+  :pr:`25488` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.decomposition`
 ............................
 
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
index 28713ca739ffd..047fc87c1f18c 100644
--- a/sklearn/datasets/_openml.py
+++ b/sklearn/datasets/_openml.py
@@ -428,6 +428,7 @@ def _load_arff_response(
     md5_checksum: str,
     n_retries: int = 3,
     delay: float = 1.0,
+    read_csv_kwargs: Optional[Dict] = None,
 ):
     """Load the ARFF data associated with the OpenML URL.
 
@@ -470,6 +471,18 @@ def _load_arff_response(
     md5_checksum : str
         The MD5 checksum provided by OpenML to check the data integrity.
 
+    n_retries : int, default=3
+        The number of times to retry downloading the data if it fails.
+
+    delay : float, default=1.0
+        The delay between two consecutive downloads in seconds.
+
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
+        the default options.
+
+        .. versionadded:: 1.3
+
     Returns
     -------
     X : {ndarray, sparse matrix, dataframe}
@@ -506,13 +519,14 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
         with closing(gzip_file):
             return load_arff_from_gzip_file(gzip_file, **arff_params)
 
-    arff_params = dict(
+    arff_params: Dict = dict(
         parser=parser,
         output_type=output_type,
         openml_columns_info=openml_columns_info,
         feature_names_to_select=feature_names_to_select,
         target_names_to_select=target_names_to_select,
         shape=shape,
+        read_csv_kwargs=dict(read_csv_kwargs or {}),  # copy: updated on retry
     )
     try:
         X, y, frame, categories = _open_url_and_load_gzip_file(
@@ -530,7 +544,7 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
         # A parsing error could come from providing the wrong quotechar
         # to pandas. By default, we use a double quote. Thus, we retry
         # with a single quote before to raise the error.
-        arff_params["read_csv_kwargs"] = {"quotechar": "'"}
+        arff_params["read_csv_kwargs"].update(quotechar="'")
         X, y, frame, categories = _open_url_and_load_gzip_file(
             url, data_home, n_retries, delay, arff_params
         )
@@ -552,6 +566,7 @@ def _download_data_to_bunch(
     n_retries: int = 3,
     delay: float = 1.0,
     parser: str,
+    read_csv_kwargs: Optional[Dict] = None,
 ):
     """Download ARFF data, load it to a specific container and create to Bunch.
 
@@ -598,6 +613,12 @@ def _download_data_to_bunch(
     parser : {"liac-arff", "pandas"}
         The parser used to parse the ARFF file.
 
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
+        the default options.
+
+        .. versionadded:: 1.3
+
     Returns
     -------
     data : :class:`~sklearn.utils.Bunch`
@@ -657,6 +678,7 @@ def _download_data_to_bunch(
         md5_checksum=md5_checksum,
         n_retries=n_retries,
         delay=delay,
+        read_csv_kwargs=read_csv_kwargs,
     )
 
     return Bunch(
@@ -725,6 +747,7 @@ def fetch_openml(
     n_retries: int = 3,
     delay: float = 1.0,
     parser: Optional[str] = "warn",
+    read_csv_kwargs: Optional[Dict] = None,
 ):
     """Fetch dataset from openml by name or dataset id.
 
@@ -829,6 +852,12 @@ def fetch_openml(
            warning. Therefore, an `ImportError` will be raised from 1.4 if
            the dataset is dense and pandas is not installed.
 
+    read_csv_kwargs : dict, default=None
+        Keyword arguments passed to `pandas.read_csv` when loading the data
+        from a ARFF file. It can allows to overwrite some default parameters.
+
+        .. versionadded:: 1.3
+
     Returns
     -------
     data : :class:`~sklearn.utils.Bunch`
@@ -1096,6 +1125,7 @@ def fetch_openml(
         n_retries=n_retries,
         delay=delay,
         parser=parser_,
+        read_csv_kwargs=read_csv_kwargs,
     )
 
     if return_X_y:
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
index fef03a5fd4d40..6f2ba36ff808a 100644
--- a/sklearn/datasets/tests/test_openml.py
+++ b/sklearn/datasets/tests/test_openml.py
@@ -1354,6 +1354,34 @@ def test_dataset_with_openml_warning(monkeypatch, gzip_response):
         fetch_openml(data_id=data_id, cache=False, as_frame=False, parser="liac-arff")
 
 
+def test_fetch_openml_overwrite_default_params_read_csv(monkeypatch):
+    """Check that `read_csv_kwargs` overrides the default `read_csv` parameters."""
+    pytest.importorskip("pandas")
+    data_id = 1590
+    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)
+
+    common_params = {
+        "data_id": data_id,
+        "as_frame": True,
+        "cache": False,
+        "parser": "pandas",
+    }
+
+    # By default, the initial spaces are skipped. Passing `skipinitialspace=False`
+    # via `read_csv_kwargs` must preserve the leading space of each category.
+    adult_without_spaces = fetch_openml(**common_params)
+    adult_with_spaces = fetch_openml(
+        **common_params, read_csv_kwargs={"skipinitialspace": False}
+    )
+    assert all(
+        cat.startswith(" ") for cat in adult_with_spaces.frame["class"].cat.categories
+    )
+    assert not any(
+        cat.startswith(" ")
+        for cat in adult_without_spaces.frame["class"].cat.categories
+    )
+
+
 ###############################################################################
 # Test cache, retry mechanisms, checksum, etc.
 

From e5802ff42e7b1f821bb82f19b70e33bea5d100ef Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 25 May 2023 16:54:20 +0200
Subject: [PATCH 2/4] update pr number

---
 doc/whats_new/v1.3.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 91e3b8637dc53..a4d8de42c514b 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -241,7 +241,7 @@ Changelog
 - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using
   the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the
   pandas parser.
-  :pr:`25488` by :user:`Guillaume Lemaitre <glemaitre>`.
+  :pr:`26433` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 :mod:`sklearn.decomposition`
 ............................

From e6005888790c6716a763878baa18cd2b5e4684dd Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 25 May 2023 18:02:37 +0200
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/datasets/_openml.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
index 047fc87c1f18c..2aa997bec8b12 100644
--- a/sklearn/datasets/_openml.py
+++ b/sklearn/datasets/_openml.py
@@ -478,8 +478,8 @@ def _load_arff_response(
         The delay between two consecutive downloads in seconds.
 
     read_csv_kwargs : dict, default=None
-        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
-        the default options.
+        Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
+        It allows overriding the default options.
 
         .. versionadded:: 1.3
 
@@ -614,8 +614,8 @@ def _download_data_to_bunch(
         The parser used to parse the ARFF file.
 
     read_csv_kwargs : dict, default=None
-        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
-        the default options.
+        Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
+        It allows overriding the default options.
 
         .. versionadded:: 1.3
 
@@ -854,7 +854,8 @@ def fetch_openml(
 
     read_csv_kwargs : dict, default=None
         Keyword arguments passed to `pandas.read_csv` when loading the data
-        from a ARFF file. It can allows to overwrite some default parameters.
+        from an ARFF file and using the pandas parser. It allows overriding
+        some default parameters.
 
         .. versionadded:: 1.3
 

From 25eb07c565d6e68de2b03e3828ecb15f6c871c3b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 7 Jun 2023 13:59:30 +0200
Subject: [PATCH 4/4] Update sklearn/datasets/_openml.py

Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
---
 sklearn/datasets/_openml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
index 2aa997bec8b12..d1a1285eb5a37 100644
--- a/sklearn/datasets/_openml.py
+++ b/sklearn/datasets/_openml.py
@@ -853,7 +853,7 @@ def fetch_openml(
            the dataset is dense and pandas is not installed.
 
     read_csv_kwargs : dict, default=None
-        Keyword arguments passed to `pandas.read_csv` when loading the data
+        Keyword arguments passed to :func:`pandas.read_csv` when loading the data
         from an ARFF file and using the pandas parser. It allows overriding
         some default parameters.