From d51df24b04e5069935f9a66476c94e067cd0e422 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 30 Apr 2018 00:35:17 +0200 Subject: [PATCH 01/15] EHN Passthrough DataFrame in FunctionTransformer --- doc/modules/preprocessing.rst | 2 +- doc/whats_new/v0.20.rst | 9 ++ .../preprocessing/_function_transformer.py | 82 ++++++++++++++++--- .../tests/test_function_transformer.py | 78 +++++++++++++++++- 4 files changed, 154 insertions(+), 17 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 19bdfc0d432a0..83c94d094d01a 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -663,7 +663,7 @@ error with a ``filterwarnings``:: >>> import warnings >>> warnings.filterwarnings("error", message=".*check_inverse*.", ... category=UserWarning, append=False) - + For a full code example that demonstrates using a :class:`FunctionTransformer` to do custom feature selection, see :ref:`sphx_glr_auto_examples_preprocessing_plot_function_transformer.py` diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index c522e9bc54b3c..c46b7cb3c5a36 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -260,6 +260,11 @@ Miscellaneous :issue:`9101` by :user:`alex-33 ` and :user:`Maskani Filali Mohamed `. +- :class:`preprocessing.FunctionTransformer` is accepting pandas DataFrame in + ``func`` without converting to a NumPy array when + ``validate='array-or-frame``. :issue:`10655` by :user:`Guillaume Lemaitre + `. + Bug fixes ......... @@ -572,6 +577,10 @@ Misc acts as an upper bound on iterations. :issue:`#10982` by :user:`Juliet Lawton ` +- In :class:`preprocessing.FunctionTransformer`, the default of ``validate`` + will changed from ``True`` to ``'array-or-frame'`` in 0.22. :issue:`10655` by + :user:`Guillaume Lemaitre `. + Changes to estimator checks --------------------------- diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index f2a1290685992..82dc2e517847b 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,6 +3,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.testing import assert_allclose_dense_sparse +from ..utils.validation import _assert_all_finite from ..externals.six import string_types @@ -40,18 +41,45 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): kwargs forwarded. If inverse_func is None, then inverse_func will be the identity function. - validate : bool, optional default=True + validate : bool or 'array-or-frame', optional default=True Indicate that the input X array should be checked before calling - func. If validate is false, there will be no input validation. - If it is true, then X will be converted to a 2-dimensional NumPy - array or sparse matrix. If this conversion is not possible or X - contains NaN or infinity, an exception is raised. + func. The possibilities are: + + - If True, then X will be converted to a 2-dimensional NumPy array or + sparse matrix. If the conversion is not possible an exception is + raised. + - If False, then there is no input validation + - If 'array-or-frame', X will be pass-through if this is a pandas + DataFrame or converted to a 2-dimensional array or sparse matrix. In + this latest case, an exception will be raised if the conversion + failed. + + .. deprecated:: 0.20 + ``validate=True`` as default will be replaced by + ``validate='array-or-frame'`` in 0.22. + + .. versionadded:: 0.20 + ``validate`` takes the option ``'array-or-frame'``. accept_sparse : boolean, optional Indicate that func accepts a sparse matrix as input. If validate is False, this has no effect. Otherwise, if accept_sparse is false, sparse matrix inputs will cause an exception to be raised. + force_all_finite : boolean or 'allow-nan', optional default=True + Whether to raise an error on np.inf and np.nan in X. The possibilities + are: + + - If True, force all values of X to be finite. + - If False, accept both np.inf and np.nan in X. + - If 'allow-nan', accept only np.nan values in X. Values cannot be + infinite. + + Applied only when ``validate=True``. + + .. versionadded:: 0.20 + ``force_all_finite`` was added to let pass NaN. + pass_y : bool, optional default=False Indicate that transform should forward the y argument to the inner callable. @@ -72,18 +100,50 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): Dictionary of additional keyword arguments to pass to inverse_func. """ - def __init__(self, func=None, inverse_func=None, validate=True, - accept_sparse=False, pass_y='deprecated', check_inverse=True, - kw_args=None, inv_kw_args=None): + def __init__(self, func=None, inverse_func=None, validate=None, + accept_sparse=False, force_all_finite=True, + pass_y='deprecated', check_inverse=True, kw_args=None, + inv_kw_args=None): self.func = func self.inverse_func = inverse_func self.validate = validate self.accept_sparse = accept_sparse + self.force_all_finite = force_all_finite self.pass_y = pass_y self.check_inverse = check_inverse self.kw_args = kw_args self.inv_kw_args = inv_kw_args + def _check_input(self, X): + # FIXME: Future warning to be removed in 0.22 + if self.validate is None: + self.validate = True + warnings.warn("The default validate=True will be replaced by " + "validate='array-or-frame' in 0.22.", FutureWarning) + + if ((not isinstance(self.validate, bool)) and + self.validate != 'array-or-frame'): + raise ValueError("'validate' should be a boolean or " + "'array-or-frame'. Got {!r} instead." + .format(self.validate)) + if ((not isinstance(self.force_all_finite, bool)) and + self.force_all_finite != 'allow-nan'): + raise ValueError("'force_all_finite' should be a boolean " + "or 'allow-nan'. Got {!r} instead." + .format(self.force_all_finite)) + + if self.validate: + if hasattr(X, 'loc') and self.validate == 'array-or-frame': + if self.force_all_finite: + _assert_all_finite(X.values, allow_nan=False + if self.force_all_finite is True + else True) + return X + else: + return check_array(X, accept_sparse=self.accept_sparse, + force_all_finite=self.force_all_finite) + return X + def _check_inverse_transform(self, X): """Check that func and inverse_func are the inverse.""" idx_selected = slice(None, None, max(1, X.shape[0] // 100)) @@ -111,8 +171,7 @@ def fit(self, X, y=None): ------- self """ - if self.validate: - X = check_array(X, self.accept_sparse) + X = self._check_input(X) if (self.check_inverse and not (self.func is None or self.inverse_func is None)): self._check_inverse_transform(X) @@ -165,8 +224,7 @@ def inverse_transform(self, X, y='deprecated'): kw_args=self.inv_kw_args) def _transform(self, X, y=None, func=None, kw_args=None): - if self.validate: - X = check_array(X, self.accept_sparse) + X = self._check_input(X) if func is None: func = _identity diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 4d166457777cc..d94448dc19115 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -1,3 +1,4 @@ +import pytest import numpy as np from scipy import sparse @@ -145,7 +146,8 @@ def test_check_inverse(): trans = FunctionTransformer(func=np.sqrt, inverse_func=np.around, accept_sparse=accept_sparse, - check_inverse=True) + check_inverse=True, + validate=True) assert_warns_message(UserWarning, "The provided functions are not strictly" " inverse of each other. If you are sure you" @@ -156,15 +158,83 @@ def test_check_inverse(): trans = FunctionTransformer(func=np.expm1, inverse_func=np.log1p, accept_sparse=accept_sparse, - check_inverse=True) + check_inverse=True, + validate=True) Xt = assert_no_warnings(trans.fit_transform, X) assert_allclose_dense_sparse(X, trans.inverse_transform(Xt)) # check that we don't check inverse when one of the func or inverse is not # provided. trans = FunctionTransformer(func=np.expm1, inverse_func=None, - check_inverse=True) + check_inverse=True, validate=True) assert_no_warnings(trans.fit, X_dense) trans = FunctionTransformer(func=None, inverse_func=np.expm1, - check_inverse=True) + check_inverse=True, validate=True) assert_no_warnings(trans.fit, X_dense) + + +@pytest.mark.parametrize( + "X", + [np.array([[0, 1], [2, 3]]), + np.array([[0, 1], [2, np.nan]]), + np.array([[0, 1], [2, np.inf]]), + np.array([[0, 1], [np.inf, np.nan]])] +) +@pytest.mark.parametrize( + "force_all_finite", + [True, False, 'allow-nan'] +) +def test_function_transformer_finiteness_pandas(X, force_all_finite): + pd = pytest.importorskip('pandas') + X_df = pd.DataFrame(X) + + def func(X): + return X.columns + + transformer = FunctionTransformer(force_all_finite=force_all_finite, + validate=True) + + should_fail = False + if force_all_finite is True: + if not np.isfinite(X).all(): + should_fail = True + elif force_all_finite == 'allow-nan': + if np.isinf(X).any(): + should_fail = True + + if should_fail: + with pytest.raises(ValueError, match="Input contains"): + transformer.fit_transform(X_df) + else: + transformer.fit_transform(X_df) + + +def test_function_transformer_future_warning(): + # FIXME: to be removed in 0.22 + X = np.random.randn(100, 10) + transformer = FunctionTransformer() + with pytest.warns(FutureWarning): + transformer.fit_transform(X) + + +def test_function_transformer_frame(): + pd = pytest.importorskip('pandas') + X_df = pd.DataFrame(np.random.randn(100, 10)) + transformer = FunctionTransformer(validate='array-or-frame', + check_inverse=False) + X_df_trans = transformer.fit_transform(X_df) + assert hasattr(X_df_trans, 'loc') + + +@pytest.mark.parametrize( + "params, msg_err", + [({'validate': 'random'}, "'validate' should be"), + ({'validate': True, 'force_all_finite': 'random'}, + "'force_all_finite' should be")] +) +def test_function_transformer_params_errors(params, msg_err): + X = np.random.randn(100, 10) + transformer = FunctionTransformer() + transformer.set_params(**params) + with pytest.raises(ValueError, match=msg_err): + transformer.fit_transform(X) From 34e7e142daa2577e6995ae523a5a7a36f08bc3d2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 30 Apr 2018 00:57:35 +0200 Subject: [PATCH 02/15] FIX do not change mutable --- sklearn/preprocessing/_function_transformer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 82dc2e517847b..81d793ae95f92 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -117,23 +117,25 @@ def __init__(self, func=None, inverse_func=None, validate=None, def _check_input(self, X): # FIXME: Future warning to be removed in 0.22 if self.validate is None: - self.validate = True + self._validate = True warnings.warn("The default validate=True will be replaced by " "validate='array-or-frame' in 0.22.", FutureWarning) + else: + self._validate = self.validate - if ((not isinstance(self.validate, bool)) and - self.validate != 'array-or-frame'): + if ((not isinstance(self._validate, bool)) and + self._validate != 'array-or-frame'): raise ValueError("'validate' should be a boolean or " "'array-or-frame'. Got {!r} instead." - .format(self.validate)) + .format(self._validate)) if ((not isinstance(self.force_all_finite, bool)) and self.force_all_finite != 'allow-nan'): raise ValueError("'force_all_finite' should be a boolean " "or 'allow-nan'. Got {!r} instead." .format(self.force_all_finite)) - if self.validate: - if hasattr(X, 'loc') and self.validate == 'array-or-frame': + if self._validate: + if hasattr(X, 'loc') and self._validate == 'array-or-frame': if self.force_all_finite: _assert_all_finite(X.values, allow_nan=False if self.force_all_finite is True From b08a0b5497fac87fd75659da71a39a2418650ada Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 1 May 2018 13:10:54 +0200 Subject: [PATCH 03/15] FIX joris comments --- doc/whats_new/v0.20.rst | 3 ++- .../preprocessing/_function_transformer.py | 23 ++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index c46b7cb3c5a36..61516f1f1078b 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -262,7 +262,8 @@ Miscellaneous - :class:`preprocessing.FunctionTransformer` is accepting pandas DataFrame in ``func`` without converting to a NumPy array when - ``validate='array-or-frame``. :issue:`10655` by :user:`Guillaume Lemaitre + ``validate='array-or-frame``. In addition the parameter ``force_all_finite`` + was added to let pass NaN. :issue:`10655` by :user:`Guillaume Lemaitre `. Bug fixes diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 81d793ae95f92..b40b46523e96e 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -45,14 +45,18 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): Indicate that the input X array should be checked before calling func. The possibilities are: - - If True, then X will be converted to a 2-dimensional NumPy array or - sparse matrix. If the conversion is not possible an exception is - raised. - - If False, then there is no input validation - - If 'array-or-frame', X will be pass-through if this is a pandas + - If 'array-or-frame', X will be passed through if it is a pandas DataFrame or converted to a 2-dimensional array or sparse matrix. In this latest case, an exception will be raised if the conversion failed. + - If True, then X will be converted to a 2-dimensional NumPy array or + sparse matrix. If the conversion is not possible an exception is + raised. + - If False, then there is no input validation. + + When X is validated, the parameters ``accept_sparse`` and + ``force_all_finite`` will control the validation for the sparsity and + the finiteness of X, respectively. .. deprecated:: 0.20 ``validate=True`` as default will be replaced by @@ -75,7 +79,7 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): - If 'allow-nan', accept only np.nan values in X. Values cannot be infinite. - Applied only when ``validate=True``. + This parameter is discarded when ``validate=False``. .. versionadded:: 0.20 ``force_all_finite`` was added to let pass NaN. @@ -118,8 +122,11 @@ def _check_input(self, X): # FIXME: Future warning to be removed in 0.22 if self.validate is None: self._validate = True - warnings.warn("The default validate=True will be replaced by " - "validate='array-or-frame' in 0.22.", FutureWarning) + if hasattr(X, 'loc'): + warnings.warn("The default validate=True will be replaced by " + "validate='array-or-frame' in 0.22. A pandas " + "DataFrame will not be converted to a 2D " + "NumPy array.", FutureWarning) else: self._validate = self.validate From 4638b795731945559e23f8f5d11c25b68b451dc7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 1 May 2018 13:35:47 +0200 Subject: [PATCH 04/15] TST check futurewarnings raised only with dataframe --- .../tests/test_function_transformer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index d94448dc19115..59d17108c6dfb 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -209,12 +209,21 @@ def func(X): transformer.fit_transform(X_df) -def test_function_transformer_future_warning(): +@pytest.mark.parametrize( + "is_dataframe", + [True, False] +) +def test_function_transformer_future_warning(is_dataframe): # FIXME: to be removed in 0.22 X = np.random.randn(100, 10) transformer = FunctionTransformer() - with pytest.warns(FutureWarning): - transformer.fit_transform(X) + if is_dataframe: + pd = pytest.importorskip('pandas') + X_df = pd.DataFrame(X) + with pytest.warns(FutureWarning): + transformer.fit_transform(X_df) + else: + assert_no_warnings(transformer.fit, X) def test_function_transformer_frame(): From 707328849a4e6d4877251bae03d5f3fc8bdb203b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 7 May 2018 00:31:22 +0200 Subject: [PATCH 05/15] address rth comments --- sklearn/preprocessing/_function_transformer.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index b40b46523e96e..e3f719a61709a 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -62,7 +62,7 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): ``validate=True`` as default will be replaced by ``validate='array-or-frame'`` in 0.22. - .. versionadded:: 0.20 + .. versionchanged:: 0.20 ``validate`` takes the option ``'array-or-frame'``. accept_sparse : boolean, optional @@ -71,18 +71,15 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): sparse matrix inputs will cause an exception to be raised. force_all_finite : boolean or 'allow-nan', optional default=True - Whether to raise an error on np.inf and np.nan in X. The possibilities - are: + Whether to raise an error on np.inf and np.nan in X. If validate is + False, this has not effect. The possibilities are: - If True, force all values of X to be finite. - If False, accept both np.inf and np.nan in X. - If 'allow-nan', accept only np.nan values in X. Values cannot be infinite. - This parameter is discarded when ``validate=False``. - .. versionadded:: 0.20 - ``force_all_finite`` was added to let pass NaN. pass_y : bool, optional default=False Indicate that transform should forward the y argument to the From 2f3555a942d77f0d1a6979db4d473299cb8cf5f9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 16 May 2018 17:10:48 +0200 Subject: [PATCH 06/15] review roman comments --- doc/whats_new/v0.20.rst | 4 ++-- sklearn/preprocessing/_function_transformer.py | 13 ++++++------- .../tests/test_function_transformer.py | 10 ++-------- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 61516f1f1078b..1b04e64734d8c 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -260,7 +260,7 @@ Miscellaneous :issue:`9101` by :user:`alex-33 ` and :user:`Maskani Filali Mohamed `. -- :class:`preprocessing.FunctionTransformer` is accepting pandas DataFrame in +- :class:`preprocessing.FunctionTransformer` now accepts pandas DataFrame in ``func`` without converting to a NumPy array when ``validate='array-or-frame``. In addition the parameter ``force_all_finite`` was added to let pass NaN. :issue:`10655` by :user:`Guillaume Lemaitre @@ -579,7 +579,7 @@ Misc :issue:`#10982` by :user:`Juliet Lawton ` - In :class:`preprocessing.FunctionTransformer`, the default of ``validate`` - will changed from ``True`` to ``'array-or-frame'`` in 0.22. :issue:`10655` by + will be from ``'array-or-frame'`` in 0.22. :issue:`10655` by :user:`Guillaume Lemaitre `. Changes to estimator checks diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index e3f719a61709a..db65789a8ed97 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -52,10 +52,10 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): - If True, then X will be converted to a 2-dimensional NumPy array or sparse matrix. If the conversion is not possible an exception is raised. - - If False, then there is no input validation. + - If False, there is no input validation. When X is validated, the parameters ``accept_sparse`` and - ``force_all_finite`` will control the validation for the sparsity and + ``force_all_finite`` control the validation for the sparsity and the finiteness of X, respectively. .. deprecated:: 0.20 @@ -127,8 +127,7 @@ def _check_input(self, X): else: self._validate = self.validate - if ((not isinstance(self._validate, bool)) and - self._validate != 'array-or-frame'): + if self._validate not in (True, False, 'array-or-frame'): raise ValueError("'validate' should be a boolean or " "'array-or-frame'. Got {!r} instead." .format(self._validate)) @@ -141,9 +140,9 @@ def _check_input(self, X): if self._validate: if hasattr(X, 'loc') and self._validate == 'array-or-frame': if self.force_all_finite: - _assert_all_finite(X.values, allow_nan=False - if self.force_all_finite is True - else True) + _assert_all_finite( + X.values, + allow_nan=not (self.force_all_finite is True)) return X else: return check_array(X, accept_sparse=self.accept_sparse, diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 59d17108c6dfb..f8b03a58ee429 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -180,10 +180,7 @@ def test_check_inverse(): np.array([[0, 1], [2, np.inf]]), np.array([[0, 1], [np.inf, np.nan]])] ) -@pytest.mark.parametrize( - "force_all_finite", - [True, False, 'allow-nan'] -) +@pytest.mark.parametrize("force_all_finite", [True, False, 'allow-nan']) def test_function_transformer_finiteness_pandas(X, force_all_finite): pd = pytest.importorskip('pandas') X_df = pd.DataFrame(X) @@ -209,10 +206,7 @@ def func(X): transformer.fit_transform(X_df) -@pytest.mark.parametrize( - "is_dataframe", - [True, False] -) +@pytest.mark.parametrize("is_dataframe", [True, False]) def test_function_transformer_future_warning(is_dataframe): # FIXME: to be removed in 0.22 X = np.random.randn(100, 10) From 0635838bf4828264c4c6d86a48b0c916b2da8afa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 8 Jun 2018 15:17:35 +0200 Subject: [PATCH 07/15] Default on False and make a conversion if passing a list --- .../preprocessing/_function_transformer.py | 60 ++++-------------- .../tests/test_function_transformer.py | 62 ++----------------- 2 files changed, 17 insertions(+), 105 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index db65789a8ed97..df375a2ad1b7f 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,9 +1,9 @@ import warnings +import numpy as np from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.testing import assert_allclose_dense_sparse -from ..utils.validation import _assert_all_finite from ..externals.six import string_types @@ -45,42 +45,21 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): Indicate that the input X array should be checked before calling func. The possibilities are: - - If 'array-or-frame', X will be passed through if it is a pandas - DataFrame or converted to a 2-dimensional array or sparse matrix. In - this latest case, an exception will be raised if the conversion - failed. + - If False, there is no input validation. If and only if X is a list, + it will be converted to a 2-dimensional NumPy array. - If True, then X will be converted to a 2-dimensional NumPy array or sparse matrix. If the conversion is not possible an exception is raised. - - If False, there is no input validation. - - When X is validated, the parameters ``accept_sparse`` and - ``force_all_finite`` control the validation for the sparsity and - the finiteness of X, respectively. .. deprecated:: 0.20 ``validate=True`` as default will be replaced by - ``validate='array-or-frame'`` in 0.22. - - .. versionchanged:: 0.20 - ``validate`` takes the option ``'array-or-frame'``. + ``validate=False`` in 0.22. accept_sparse : boolean, optional Indicate that func accepts a sparse matrix as input. If validate is False, this has no effect. Otherwise, if accept_sparse is false, sparse matrix inputs will cause an exception to be raised. - force_all_finite : boolean or 'allow-nan', optional default=True - Whether to raise an error on np.inf and np.nan in X. If validate is - False, this has not effect. The possibilities are: - - - If True, force all values of X to be finite. - - If False, accept both np.inf and np.nan in X. - - If 'allow-nan', accept only np.nan values in X. Values cannot be - infinite. - - .. versionadded:: 0.20 - pass_y : bool, optional default=False Indicate that transform should forward the y argument to the inner callable. @@ -119,34 +98,17 @@ def _check_input(self, X): # FIXME: Future warning to be removed in 0.22 if self.validate is None: self._validate = True - if hasattr(X, 'loc'): - warnings.warn("The default validate=True will be replaced by " - "validate='array-or-frame' in 0.22. A pandas " - "DataFrame will not be converted to a 2D " - "NumPy array.", FutureWarning) + warnings.warn("The default validate=True will be replaced by " + "validate=False in 0.22.", FutureWarning) else: self._validate = self.validate - if self._validate not in (True, False, 'array-or-frame'): - raise ValueError("'validate' should be a boolean or " - "'array-or-frame'. Got {!r} instead." - .format(self._validate)) - if ((not isinstance(self.force_all_finite, bool)) and - self.force_all_finite != 'allow-nan'): - raise ValueError("'force_all_finite' should be a boolean " - "or 'allow-nan'. Got {!r} instead." - .format(self.force_all_finite)) - if self._validate: - if hasattr(X, 'loc') and self._validate == 'array-or-frame': - if self.force_all_finite: - _assert_all_finite( - X.values, - allow_nan=not (self.force_all_finite is True)) - return X - else: - return check_array(X, accept_sparse=self.accept_sparse, - force_all_finite=self.force_all_finite) + return check_array(X, accept_sparse=self.accept_sparse) + else: + # convert X to NumPy array when this is a list + if isinstance(X, list): + return np.asarray(X) return X def _check_inverse_transform(self, X): diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index f8b03a58ee429..9b820ed48e568 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -173,49 +173,14 @@ def test_check_inverse(): assert_no_warnings(trans.fit, X_dense) -@pytest.mark.parametrize( - "X", - [np.array([[0, 1], [2, 3]]), - np.array([[0, 1], [2, np.nan]]), - np.array([[0, 1], [2, np.inf]]), - np.array([[0, 1], [np.inf, np.nan]])] -) -@pytest.mark.parametrize("force_all_finite", [True, False, 'allow-nan']) -def test_function_transformer_finiteness_pandas(X, force_all_finite): - pd = pytest.importorskip('pandas') - X_df = pd.DataFrame(X) - - def func(X): - return X.columns - - transformer = FunctionTransformer(force_all_finite=force_all_finite, - validate=True) - - should_fail = False - if force_all_finite is True: - if not np.isfinite(X).all(): - should_fail = True - elif force_all_finite == 'allow-nan': - if np.isinf(X).any(): - should_fail = True - - if should_fail: - with pytest.raises(ValueError, match="Input contains"): - transformer.fit_transform(X_df) - else: - transformer.fit_transform(X_df) - - -@pytest.mark.parametrize("is_dataframe", [True, False]) -def test_function_transformer_future_warning(is_dataframe): +@pytest.mark.parametrize("validate", [None, True, False]) +def test_function_transformer_future_warning(validate): # FIXME: to be removed in 0.22 X = np.random.randn(100, 10) - transformer = FunctionTransformer() - if is_dataframe: - pd = pytest.importorskip('pandas') - X_df = pd.DataFrame(X) + transformer = FunctionTransformer(validate=validate) + if validate is None: with pytest.warns(FutureWarning): - transformer.fit_transform(X_df) + transformer.fit_transform(X) else: assert_no_warnings(transformer.fit, X) @@ -223,21 +188,6 @@ def test_function_transformer_future_warning(is_dataframe): def test_function_transformer_frame(): pd = pytest.importorskip('pandas') X_df = pd.DataFrame(np.random.randn(100, 10)) - transformer = FunctionTransformer(validate='array-or-frame', - check_inverse=False) + transformer = FunctionTransformer(validate=False) X_df_trans = transformer.fit_transform(X_df) assert hasattr(X_df_trans, 'loc') - - -@pytest.mark.parametrize( - "params, msg_err", - [({'validate': 'random'}, "'validate' should be"), - ({'validate': True, 'force_all_finite': 'random'}, - "'force_all_finite' should be")] -) -def test_function_transformer_params_errors(params, msg_err): - X = np.random.randn(100, 10) - transformer = FunctionTransformer() - transformer.set_params(**params) - with pytest.raises(ValueError, match=msg_err): - transformer.fit_transform(X) From b00d16ab8012268b3180927f80ad193ac56bba79 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 8 Jun 2018 15:20:47 +0200 Subject: [PATCH 08/15] TST ensure the conversion to array for validate=False --- sklearn/preprocessing/tests/test_function_transformer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 9b820ed48e568..61a73a34f4fa9 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -191,3 +191,11 @@ def test_function_transformer_frame(): transformer = FunctionTransformer(validate=False) X_df_trans = transformer.fit_transform(X_df) assert hasattr(X_df_trans, 'loc') + + +def test_function_transformer_list(): + # test that we convert a list to a NumPy array when validate=False + X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + transformer = FunctionTransformer(validate=False) + X_trans = transformer.fit_transform(X) + assert isinstance(X_trans, np.ndarray) From d065f20e096c8c3b02cac9cb8f519f5e949a4ac9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 8 Jun 2018 15:23:17 +0200 Subject: [PATCH 09/15] DOC update whats new --- doc/whats_new/v0.20.rst | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 1b04e64734d8c..72d40ac246c0c 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -260,12 +260,7 @@ Miscellaneous :issue:`9101` by :user:`alex-33 ` and :user:`Maskani Filali Mohamed `. -- :class:`preprocessing.FunctionTransformer` now accepts pandas DataFrame in - ``func`` without converting to a NumPy array when - ``validate='array-or-frame``. In addition the parameter ``force_all_finite`` - was added to let pass NaN. :issue:`10655` by :user:`Guillaume Lemaitre - `. - + Bug fixes ......... @@ -579,7 +574,7 @@ Misc :issue:`#10982` by :user:`Juliet Lawton ` - In :class:`preprocessing.FunctionTransformer`, the default of ``validate`` - will be from ``'array-or-frame'`` in 0.22. :issue:`10655` by + will be from ``False`` in 0.22. :issue:`10655` by :user:`Guillaume Lemaitre `. Changes to estimator checks From 7830de499336058c42e12783abc1cb8c356f6175 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Jun 2018 15:55:40 +0200 Subject: [PATCH 10/15] FIX do not perform list conversion to array --- doc/whats_new/v0.20.rst | 5 ++--- sklearn/preprocessing/_function_transformer.py | 11 +++-------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 1ded25828e12e..e35fcb3e3aeff 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -645,9 +645,8 @@ Misc :issue:`10928` by :user:`Solutus Immensus ` - In :class:`preprocessing.FunctionTransformer`, the default of ``validate`` - will be from ``True`` to ``False`` in 0.22. ``validate=False`` also convert - ``X`` to a 2D NumPy array when given as a list. :issue:`10655` by - :user:`Guillaume Lemaitre `. + will be from ``True`` to ``False`` in 0.22. + :issue:`10655` by :user:`Guillaume Lemaitre `. Changes to estimator checks --------------------------- diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index df375a2ad1b7f..b04be45563d37 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -41,12 +41,11 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): kwargs forwarded. If inverse_func is None, then inverse_func will be the identity function. - validate : bool or 'array-or-frame', optional default=True + validate : bool, optional default=True Indicate that the input X array should be checked before calling - func. The possibilities are: + ``func``. The possibilities are: - - If False, there is no input validation. If and only if X is a list, - it will be converted to a 2-dimensional NumPy array. + - If False, there is no input validation. - If True, then X will be converted to a 2-dimensional NumPy array or sparse matrix. If the conversion is not possible an exception is raised. @@ -105,10 +104,6 @@ def _check_input(self, X): if self._validate: return check_array(X, accept_sparse=self.accept_sparse) - else: - # convert X to NumPy array when this is a list - if isinstance(X, list): - return np.asarray(X) return X def _check_inverse_transform(self, X): From 7cd8d8a78682e24e556e65cf5e431e3d19da0227 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 14:46:22 +0200 Subject: [PATCH 11/15] TST fix the tests and simplified using pytest --- .../tests/test_function_transformer.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 61a73a34f4fa9..170fff8e4221f 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -173,16 +173,16 @@ def test_check_inverse(): assert_no_warnings(trans.fit, X_dense) -@pytest.mark.parametrize("validate", [None, True, False]) -def test_function_transformer_future_warning(validate): +@pytest.mark.parametrize("validate, expected_warning", + [(None, FutureWarning), + (True, None), + (False, None)]) +def test_function_transformer_future_warning(validate, expected_warning): # FIXME: to be removed in 0.22 X = np.random.randn(100, 10) transformer = FunctionTransformer(validate=validate) - if validate is None: - with pytest.warns(FutureWarning): - transformer.fit_transform(X) - else: - assert_no_warnings(transformer.fit, X) + with pytest.warns(expected_warning): + transformer.fit_transform(X) def test_function_transformer_frame(): @@ -191,11 +191,3 @@ def test_function_transformer_frame(): transformer = FunctionTransformer(validate=False) X_df_trans = transformer.fit_transform(X_df) assert hasattr(X_df_trans, 'loc') - - -def test_function_transformer_list(): - # test that we convert a list to a NumPy array when validate=False - X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - transformer = FunctionTransformer(validate=False) - X_trans = transformer.fit_transform(X) - assert isinstance(X_trans, np.ndarray) From 7d8ce92b28862dbc434d68af9e1c21f007372a6d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 15:08:35 +0200 Subject: [PATCH 12/15] PEP8 --- sklearn/preprocessing/_function_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index b04be45563d37..b4ff0118d6f4c 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,5 +1,4 @@ import warnings -import numpy as np from ..base import BaseEstimator, TransformerMixin from ..utils import check_array From f1d9276eb512845107bc30d0da25e15b8f8e8a27 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 23:36:12 +0200 Subject: [PATCH 13/15] TST ensure not warning raised --- sklearn/preprocessing/tests/test_function_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 170fff8e4221f..0bd57a859649f 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -181,8 +181,10 @@ def test_function_transformer_future_warning(validate, expected_warning): # FIXME: to be removed in 0.22 X = np.random.randn(100, 10) transformer = FunctionTransformer(validate=validate) - with pytest.warns(expected_warning): + with pytest.warns(expected_warning) as results: transformer.fit_transform(X) + if expected_warning is None: + assert len(results) == 0 def test_function_transformer_frame(): From 22bcd780b6dfc9acf601d7a2b3489398a388ec51 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 9 Jul 2018 23:56:19 +0200 Subject: [PATCH 14/15] address comments --- doc/modules/preprocessing.rst | 2 +- doc/whats_new/v0.20.rst | 3 ++- sklearn/preprocessing/_function_transformer.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 83c94d094d01a..19bdfc0d432a0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -663,7 +663,7 @@ error with a ``filterwarnings``:: >>> import warnings >>> warnings.filterwarnings("error", message=".*check_inverse*.", ... category=UserWarning, append=False) - + For a full code example that demonstrates using a :class:`FunctionTransformer` to do custom feature selection, see :ref:`sphx_glr_auto_examples_preprocessing_plot_function_transformer.py` diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index e35fcb3e3aeff..9467b6d620d36 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -301,7 +301,6 @@ Miscellaneous :issue:`9101` by :user:`alex-33 ` and :user:`Maskani Filali Mohamed `. - Bug fixes ......... @@ -644,6 +643,8 @@ Misc - Invalid input for :class:`model_selection.ParameterGrid` now raises TypeError. :issue:`10928` by :user:`Solutus Immensus ` +Preprocessing + - In :class:`preprocessing.FunctionTransformer`, the default of ``validate`` will be from ``True`` to ``False`` in 0.22. :issue:`10655` by :user:`Guillaume Lemaitre `. diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index b4ff0118d6f4c..3e0f69ff127d8 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -86,7 +86,6 @@ def __init__(self, func=None, inverse_func=None, validate=None, self.inverse_func = inverse_func self.validate = validate self.accept_sparse = accept_sparse - self.force_all_finite = force_all_finite self.pass_y = pass_y self.check_inverse = check_inverse self.kw_args = kw_args From 1e45d505bd8cf315fecdd3316c6fc02059aa2cc3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Jul 2018 04:13:38 +0200 Subject: [PATCH 15/15] gotcha --- sklearn/preprocessing/_function_transformer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 3e0f69ff127d8..0c79543338212 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -79,9 +79,8 @@ class FunctionTransformer(BaseEstimator, TransformerMixin): """ def __init__(self, func=None, inverse_func=None, validate=None, - accept_sparse=False, force_all_finite=True, - pass_y='deprecated', check_inverse=True, kw_args=None, - inv_kw_args=None): + accept_sparse=False, pass_y='deprecated', check_inverse=True, + kw_args=None, inv_kw_args=None): self.func = func self.inverse_func = inverse_func self.validate = validate