From 672b43420d57401a9cd3086c39be4699b9a35d0d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 3 Sep 2019 08:14:09 +0200 Subject: [PATCH 1/5] check_array float->int casting with NaN --- sklearn/utils/tests/test_validation.py | 22 ++++++++++++++++++++++ sklearn/utils/validation.py | 18 +++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 0f7ffe9a3e4f0..ce56dd6f3ab5d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -201,6 +201,28 @@ def test_check_array_force_all_finite_object(): with pytest.raises(ValueError, match='Input contains NaN'): check_array(X, dtype=None, force_all_finite=True) + # casting a float array containing NaN or inf to int dtype should + # raise an error irrespective of the force_all_finite parameter. + X = np.array([[1, np.nan]]) + + msg = "Input contains NaN, infinity or a value too large for.*int" + with pytest.raises(ValueError, match=msg): + check_array(X, dtype=np.int, force_all_finite=True) + + with pytest.raises(ValueError, match=msg): + check_array(X, dtype=np.int, force_all_finite=False) + + X = np.array([[1, np.inf]]) + + with pytest.raises(ValueError, match=msg): + check_array(X, dtype=np.int) + + X = np.array([[1, np.nan]], dtype=np.object) + + msg = 'cannot convert float NaN to integer' + with pytest.raises(ValueError, match=msg): + check_array(X, dtype=np.int) + @ignore_warnings def test_check_array(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 465acf48e8293..860d8b74930ed 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -32,7 +32,7 @@ warnings.simplefilter('ignore', NonBLASDotWarning) -def _assert_all_finite(X, allow_nan=False): +def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath from .extmath import 
_safe_accumulator_op @@ -52,7 +52,11 @@ def _assert_all_finite(X, allow_nan=False): if (allow_nan and np.isinf(X).any() or not allow_nan and not np.isfinite(X).all()): type_err = 'infinity' if allow_nan else 'NaN, infinity' - raise ValueError(msg_err.format(type_err, X.dtype)) + raise ValueError( + msg_err.format + (type_err, + msg_dtype if msg_dtype is not None else X.dtype) + ) # for object dtype data, we only check for NaNs (GH-13254) elif X.dtype == np.dtype('object') and not allow_nan: if _object_dtype_isnan(X).any(): @@ -494,7 +498,15 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, with warnings.catch_warnings(): try: warnings.simplefilter('error', ComplexWarning) - array = np.asarray(array, dtype=dtype, order=order) + array = np.asarray(array, order=order) + if dtype is not None: + if np.dtype(dtype).kind == 'i' and array.dtype.kind == 'f': + # Conversion float -> int should not contain NaN or + # inf. We cannot use casting='safe' because then + # conversion float -> int would be disallowed. 
+ _assert_all_finite(array, allow_nan=False, + msg_dtype=dtype) + array = array.astype(dtype, casting="unsafe", copy=False) except ComplexWarning: raise ValueError("Complex data not supported\n" "{}\n".format(array)) From d2ccfd0a58d0795844bbe32112a864bd072a2d92 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 3 Sep 2019 12:59:13 +0200 Subject: [PATCH 2/5] Previous behaviour for dtype=np.object --- sklearn/utils/validation.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 860d8b74930ed..416a21ee9e045 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -498,15 +498,17 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, with warnings.catch_warnings(): try: warnings.simplefilter('error', ComplexWarning) - array = np.asarray(array, order=order) - if dtype is not None: - if np.dtype(dtype).kind == 'i' and array.dtype.kind == 'f': - # Conversion float -> int should not contain NaN or - # inf. We cannot use casting='safe' because then - # conversion float -> int would be disallowed. + if dtype is not None and np.dtype(dtype).kind in 'iu': + # Conversion float -> int should not contain NaN or + # inf. We cannot use casting='safe' because then + # conversion float -> int would be disallowed. 
+ array = np.asarray(array, order=order) + if array.dtype.kind == 'f': _assert_all_finite(array, allow_nan=False, msg_dtype=dtype) array = array.astype(dtype, casting="unsafe", copy=False) + else: + array = np.asarray(array, order=order, dtype=dtype) except ComplexWarning: raise ValueError("Complex data not supported\n" "{}\n".format(array)) From 3711329c078d70664b5107c4dc51cc6b9cf55285 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 3 Sep 2019 13:07:31 +0200 Subject: [PATCH 3/5] Comment wording improvement --- sklearn/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 416a21ee9e045..5da8b6f2bed64 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -500,8 +500,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, warnings.simplefilter('error', ComplexWarning) if dtype is not None and np.dtype(dtype).kind in 'iu': # Conversion float -> int should not contain NaN or - # inf. We cannot use casting='safe' because then - # conversion float -> int would be disallowed. + # inf (numpy#14412). We cannot use casting='safe' because + # then conversion float -> int would be disallowed. array = np.asarray(array, order=order) if array.dtype.kind == 'f': _assert_all_finite(array, allow_nan=False, From eb52dbb3aa38ba5bfec711790a3a88641daccc2d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Sep 2019 10:18:13 +0200 Subject: [PATCH 4/5] apply changes --- doc/whats_new/v0.22.rst | 4 +++ sklearn/utils/tests/test_validation.py | 36 ++++++++++++-------------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index ce3174218679f..58c1a0c95706c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -397,6 +397,10 @@ Changelog a proper error message is raised if X contains some negative entries. :pr:`14680` by :user:`Alex Gramfort `. 
+- |Fix| :func:`utils.check_array` is now raising an error instead of casting + NaN to integer. + :pr:`14872` by `Roman Yurchak`_. + :mod:`sklearn.neighbors` .................... diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ce56dd6f3ab5d..d5c0aa444a8e2 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -201,27 +201,25 @@ def test_check_array_force_all_finite_object(): with pytest.raises(ValueError, match='Input contains NaN'): check_array(X, dtype=None, force_all_finite=True) + +@pytest.mark.parametrize( + "X, err_msg", + [(np.array([[1, np.nan]]), + "Input contains NaN, infinity or a value too large for.*int"), + (np.array([[1, np.nan]]), + "Input contains NaN, infinity or a value too large for.*int"), + (np.array([[1, np.inf]]), + "Input contains NaN, infinity or a value too large for.*int"), + (np.array([[1, np.nan]], dtype=np.object), + "cannot convert float NaN to integer")] +) +@pytest.mark.parametrize("force_all_finite", [True, False]) +def test_check_array_force_all_finite_object_unsafe_casting( + X, err_msg, force_all_finite): # casting a float array containing NaN or inf to int dtype should # raise an error irrespective of the force_all_finite parameter. 
- X = np.array([[1, np.nan]]) - - msg = "Input contains NaN, infinity or a value too large for.*int" - with pytest.raises(ValueError, match=msg): - check_array(X, dtype=np.int, force_all_finite=True) - - with pytest.raises(ValueError, match=msg): - check_array(X, dtype=np.int, force_all_finite=False) - - X = np.array([[1, np.inf]]) - - with pytest.raises(ValueError, match=msg): - check_array(X, dtype=np.int) - - X = np.array([[1, np.nan]], dtype=np.object) - - msg = 'cannot convert float NaN to integer' - with pytest.raises(ValueError, match=msg): - check_array(X, dtype=np.int) + with pytest.raises(ValueError, match=err_msg): + check_array(X, dtype=np.int, force_all_finite=force_all_finite) @ignore_warnings From 9923c3e5249b0d8377fabb358118f23633f0eb13 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Sep 2019 10:21:21 +0200 Subject: [PATCH 5/5] fix merge conflict --- doc/whats_new/v0.22.rst | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 91dbc7848b81f..752c865519e2e 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -537,17 +537,6 @@ Changelog :func:`~utils.estimator_checks.parametrize_with_checks`, to parametrize estimator checks for a list of estimators. :pr:`14381` by `Thomas Fan`_. -- |API| `requires_positive_X` estimator tag (for models that require - X to be non-negative) is now used by `check_estimator` to make sure - a proper error message is raised if X contains some negative entries. - :pr:`14680` by :user:`Alex Gramfort `. - -- |Fix| :func:`utils.check_array` is now raising an error instead of casting - NaN to integer. - :pr:`14872` by `Roman Yurchak`_. - -:mod:`sklearn.neighbors` -.................... - |API| The following utils have been deprecated and are now private: - ``choose_check_classifiers_labels`` - ``enforce_estimator_tags_y`` @@ -562,6 +551,10 @@ Changelog and sparse matrix. 
:pr:`14538` by :user:`Jérémie du Boisberranger <jeremiedbb>`. +- |Fix| :func:`utils.check_array` now raises an error instead of casting + NaN to integer. + :pr:`14872` by `Roman Yurchak`_. + :mod:`sklearn.metrics` ..................................