From e9a48413219a62de2e448a96ba22163367b81a25 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 9 Jun 2023 10:26:42 +0200 Subject: [PATCH 1/2] FIX only consider ? as missing marker as per ARFF specs --- doc/whats_new/v1.3.rst | 5 +++++ sklearn/datasets/_arff_parser.py | 1 + 2 files changed, 6 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 76138d3bea483..116f879d85a46 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -274,6 +274,11 @@ Changelog - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. +- |Fix| Following the ARFF specs, only the marker `"?"` is now considered as a missing + value when opening ARFF files fetched using :func:`datasets.fetch_openml` when using + the pandas parser. The parameter `read_csv_kwargs` allows to overwrite this behaviour. + :pr:`xxx` by :user:`Guillaume Lemaitre <glemaitre>`. + - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the pandas parser. 
diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 26a394b95f790..7b2faa4b67f4d 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -387,6 +387,7 @@ def _pandas_arff_parser( "header": None, "index_col": False, # always force pandas to not use the first column as index "na_values": ["?"], # missing values are represented by `?` + "keep_default_na": False, # only `?` is a missing value given the ARFF specs "comment": "%", # skip line starting by `%` since they are comments "quotechar": '"', # delimiter to use for quoted strings "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs From e7bcbd2266bdab66aad78f4d371c4aa0cc4e05d6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 9 Jun 2023 10:29:25 +0200 Subject: [PATCH 2/2] update pr number --- doc/whats_new/v1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 116f879d85a46..2ba3a3494bc6a 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -277,7 +277,7 @@ Changelog - |Fix| Following the ARFF specs, only the marker `"?"` is now considered as a missing value when opening ARFF files fetched using :func:`datasets.fetch_openml` when using the pandas parser. The parameter `read_csv_kwargs` allows to overwrite this behaviour. - :pr:`xxx` by :user:`Guillaume Lemaitre <glemaitre>`. + :pr:`26551` by :user:`Guillaume Lemaitre <glemaitre>`. - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the