diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 76138d3bea483..2ba3a3494bc6a 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -274,6 +274,11 @@ Changelog - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. +- |Fix| Following the ARFF specs, only the marker `"?"` is now considered a missing + value when opening ARFF files fetched using :func:`datasets.fetch_openml` with + the pandas parser. The parameter `read_csv_kwargs` allows overriding this behaviour. + :pr:`26551` by :user:`Guillaume Lemaitre <glemaitre>`. + - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the pandas parser. diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 26a394b95f790..7b2faa4b67f4d 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -387,6 +387,7 @@ def _pandas_arff_parser( "header": None, "index_col": False, # always force pandas to not use the first column as index "na_values": ["?"], # missing values are represented by `?` + "keep_default_na": False, # only `?` is a missing value given the ARFF specs "comment": "%", # skip line starting by `%` since they are comments "quotechar": '"', # delimiter to use for quoted strings "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs