From ed30fd1bd7b7367da71764a126b95e5e926a583b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 19:30:30 -0500 Subject: [PATCH 1/7] BUG Fixes pandas dataframe bug with boolean dtypes --- sklearn/utils/tests/test_validation.py | 23 +++++++++++++++++++++++ sklearn/utils/validation.py | 9 ++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 56efb98a8b2d8..fcd38873134f2 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -826,6 +826,29 @@ def test_check_dataframe_warns_on_dtype(): assert len(record) == 0 +def test_check_dataframe_mixed_float_dtypes(): + # pandas dataframe will coerce a boolean into a object, this is a mismatch + # with np.result_type which will return a float + # check_array needs explicity check for bool dtype in a dataframe for this + # situation + # https://github.com/scikit-learn/scikit-learn/issues/15787 + + pd = importorskip("pandas") + df = pd.DataFrame({ + 'int': [1, 2, 3], + 'float': [0, 0.1, 2.1], + 'bool': [True, False, True]}) + + array = check_array(df, dtype=(np.float64, np.float32, np.float16)) + + expected_array = np.array( + [[1.0, 0.0, 1.0], + [2.0, 0.1, 0.0], + [3.0, 2.1, 1.0]], dtype=np.float) + + assert_allclose_dense_sparse(array, expected_array) + + class DummyMemory: def cache(self, func): return func diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 424cf4b5180a3..eda522e57704c 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -454,9 +454,12 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): - dtypes_orig = np.array(array.dtypes) - if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): - dtype_orig = np.result_type(*array.dtypes) + dtypes_orig = list(array.dtypes) + # pandas boolean dtype __array__ interface coerces bools to objects + if any(dtype.kind == 'b' for dtype in dtypes_orig): + dtypes_orig.append(np.object) + elif all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): + dtype_orig = np.result_type(*dtypes_orig) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O": From 1a5a3065c2634f0e21097c03575ade054825e142 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 19:34:13 -0500 Subject: [PATCH 2/7] BUG Fixes pandas dataframe bug with boolean dtypes --- sklearn/utils/validation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index eda522e57704c..fb34f3b3cccbd 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -456,9 +456,11 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): dtypes_orig = list(array.dtypes) # pandas boolean dtype __array__ interface coerces bools to objects - if any(dtype.kind == 'b' for dtype in dtypes_orig): - dtypes_orig.append(np.object) - elif all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): + for i, dtype_iter in enumerate(dtypes_orig): + if dtype_iter.kind == 'b': + dtypes_orig[i] = np.object + + if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) if dtype_numeric: From ad961b31412549ab63cae42a96aac8f746374e73 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 19:35:29 -0500 Subject: [PATCH 3/7] CLN Lowers the number of lines --- sklearn/utils/tests/test_validation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index fcd38873134f2..96433f8916aeb 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -840,12 +840,10 @@ def test_check_dataframe_mixed_float_dtypes(): 'bool': [True, False, True]}) array = check_array(df, dtype=(np.float64, np.float32, np.float16)) - expected_array = np.array( [[1.0, 0.0, 1.0], [2.0, 0.1, 0.0], [3.0, 2.1, 1.0]], dtype=np.float) - assert_allclose_dense_sparse(array, expected_array) From 903d16d7c4833216923f0881139ae748615b64be Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 5 Dec 2019 11:26:36 -0500 Subject: [PATCH 4/7] TST Fix for Python 3.5 ordering --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 96433f8916aeb..4781ad03c37d3 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -837,7 +837,7 @@ def test_check_dataframe_mixed_float_dtypes(): df = pd.DataFrame({ 'int': [1, 2, 3], 'float': [0, 0.1, 2.1], - 'bool': [True, False, True]}) + 'bool': [True, False, True]}, columns=['int', 'float', 'bool']) array = check_array(df, dtype=(np.float64, np.float32, np.float16)) expected_array = np.array( From 271fbb8dce76ae8a2d30c8d6b8e0aa173f156230 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 5 Dec 2019 14:35:13 -0500 Subject: [PATCH 5/7] DOC Adds to whats new --- doc/whats_new/v0.22.rst | 19 +++++++++++++++++++ doc/whats_new/v0.23.rst | 6 ++++++ sklearn/utils/tests/test_validation.py | 4 ++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7b0c031f9196b..c66ab71f1874d 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -2,6 +2,25 @@ .. currentmodule:: sklearn +.. _changes_0_22_1: + +Version 0.22.1 +============== + +**In Development** + +This is a bug-fix release to primarily resolve some packaging issues in version +0.21.0. It also includes minor documentation improvements and some bug fixes. + +Changelog +--------- + +:mod:`sklearn.utils` +.................... + +- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with + boolean columns to floats. :pr:`15797` by `Thomas Fan`_. + .. _changes_0_22: Version 0.22.0 diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index a1cf4b4dd7d00..670c835436d0f 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -56,3 +56,9 @@ Changelog - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. + +:mod:`sklearn.utils` +.................... + +- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with + boolean columns to floats. :pr:`15797` by `Thomas Fan`_. diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4781ad03c37d3..bdd31f9c4859f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -829,8 +829,8 @@ def test_check_dataframe_warns_on_dtype(): def test_check_dataframe_mixed_float_dtypes(): # pandas dataframe will coerce a boolean into a object, this is a mismatch # with np.result_type which will return a float - # check_array needs explicity check for bool dtype in a dataframe for this - # situation + # check_array needs to explicitly check for bool dtype in a dataframe for + # this situation # https://github.com/scikit-learn/scikit-learn/issues/15787 pd = importorskip("pandas") From 137f64eaa9540cba306fc71d27074c2cca7aa52e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 5 Dec 2019 18:02:59 -0500 Subject: [PATCH 6/7] REV Remove whats new from 0.23 --- doc/whats_new/v0.23.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 670c835436d0f..a1cf4b4dd7d00 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -56,9 +56,3 @@ Changelog - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. - -:mod:`sklearn.utils` -.................... - -- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with - boolean columns to floats. :pr:`15797` by `Thomas Fan`_. From 0300179d5450149fb20564d13d87400b01cf1f80 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 6 Dec 2019 07:42:28 -0500 Subject: [PATCH 7/7] DOC Fix --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index c66ab71f1874d..af08b832e9f6f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -10,7 +10,7 @@ Version 0.22.1 **In Development** This is a bug-fix release to primarily resolve some packaging issues in version -0.21.0. It also includes minor documentation improvements and some bug fixes. +0.22.0. It also includes minor documentation improvements and some bug fixes. Changelog ---------