From d0f8d60a8654747f2f5600e3fae9c071c1c417aa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Jul 2019 18:31:25 +0200 Subject: [PATCH 1/7] FIX change boolean array-likes indexing in old NumPy version --- sklearn/compose/tests/test_column_transformer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ae7ef31d6c7f1..2ccfd6d6c2eae 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -16,6 +16,7 @@ from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer, make_column_transformer from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder from sklearn.feature_extraction import DictVectorizer @@ -1108,3 +1109,14 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): err_msg = 'Specifying the columns' with pytest.raises(ValueError, match=err_msg): tf.transform(X_array) + + +def test_column_transformer_mask_indexing(): + # Regression test for #xxxxx + # Boolean mask indexing with NumPy < 1.13 + X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) + column_transformer = ColumnTransformer( + [('identity', FunctionTransformer(), [False, True, False, True])] + ) + X_trans = column_transformer.fit_transform(X) + assert X_trans.shape == (3, 2) From f95a228e5444d801b6ab09d30dcc655d347cc663 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Jul 2019 19:12:10 +0200 Subject: [PATCH 2/7] change indexing --- sklearn/utils/__init__.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index efcaf6865faa5..8db41bb27986d 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -18,6 +18,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .fixes import np_version from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, @@ -225,6 +226,17 @@ def safe_indexing(X, indices, axis=0): ) +# FIXME: to be removed once NumPy 1.13 is the minimum version required +def _array_indexing(array, key, axis=0): + """Index an array consistently across NumPy version.""" + if np_version < (1, 13): + # check if we have an boolean array-likes to make the proper indexing + key_array = np.asarray(key) + if np.issubdtype(key_array.dtype, np.bool_): + key = key_array + return array[key] if axis == 0 else array[:, key] + + def _safe_indexing_row(X, indices): """Return items or rows from X using indices. @@ -266,7 +278,7 @@ def _safe_indexing_row(X, indices): # This is often substantially faster than X[indices] return X.take(indices, axis=0) else: - return X[indices] + return _array_indexing(X, indices, axis=0) else: return [X[idx] for idx in indices] @@ -356,7 +368,7 @@ def _safe_indexing_column(X, key): return X.iloc[:, key] else: # numpy arrays, sparse arrays - return X[:, key] + return _array_indexing(X, key, axis=1) def _get_column_indices(X, key): @@ -371,7 +383,7 @@ def _get_column_indices(X, key): or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)): # Convert key into positive indexes try: - idx = np.arange(n_columns)[key] + idx = safe_indexing(np.arange(n_columns), key) except IndexError as e: raise ValueError( 'all features must be in [0, %d]' % (n_columns - 1) From 1c8180390799d22bf42b5c1673caf0cb3dd71c79 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Jul 2019 09:54:22 +0200 Subject: [PATCH 3/7] add regression test in utils --- doc/whats_new/v0.22.rst | 8 +++++++ .../compose/tests/test_column_transformer.py | 4 ++-- sklearn/utils/tests/test_utils.py | 22 +++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 93635d88069d5..114afb9185a18 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -61,6 +61,14 @@ Changelog `sample_weights` are not supported by the wrapped estimator). :pr:`13575` by :user:`William de Vazelhes `. +:mod:`sklearn.compose` +...................... + +- |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to + select the proper columns when using a boolean list and NumPy older than + 1.13. + :pr:`14510` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 2ccfd6d6c2eae..a9c4fd9e25fbe 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1112,8 +1112,8 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): def test_column_transformer_mask_indexing(): - # Regression test for #xxxxx - # Boolean mask indexing with NumPy < 1.13 + # Regression test for #14510 + # Boolean array-like does not behave as boolean array with NumPy < 1.13 X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) column_transformer = ColumnTransformer( [('identity', FunctionTransformer(), [False, True, False, True])] diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index a39e8160047a5..35cfde4aaef7d 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import (assert_raises, assert_array_equal, + assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) from sklearn.utils import check_random_state @@ -365,6 +366,27 @@ def test_safe_indexing_mock_pandas(asarray): assert_array_equal(np.array(X_df_indexed), X_indexed) +@pytest.mark.parametrize("array_type", ['array', 'sparse', 'dataframe']) +def test_safe_indexing_mask_axis_1(array_type): + # regression test for #14510 + # check that boolean array-like and boolean array lead to the same indexing + # even in NumPy < 1.13 + if array_type == 'array': + array_constructor = np.asarray + elif array_type == 'sparse': + array_constructor = sp.csr_matrix + elif array_type == 'dataframe': + pd = pytest.importorskip('pandas') + array_constructor = pd.DataFrame + + X = array_constructor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + mask = [True, False, True] + mask_array = np.array(mask) + X_masked = safe_indexing(X, mask, axis=1) + X_masked_array = safe_indexing(X, mask_array, axis=1) + assert_allclose_dense_sparse(X_masked, X_masked_array) + + def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) From c8009a28aa2855e8e01cfebd3ef5df337c0536f4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Jul 2019 12:17:43 +0200 Subject: [PATCH 4/7] fix --- sklearn/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8db41bb27986d..ac6446afcd6cf 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -229,7 +229,7 @@ def safe_indexing(X, indices, axis=0): # FIXME: to be removed once NumPy 1.13 is the minimum version required def _array_indexing(array, key, axis=0): """Index an array consistently across NumPy version.""" - if np_version < (1, 13): + if np_version < (1, 13) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) if np.issubdtype(key_array.dtype, np.bool_): From a80b33d7a60d67706150d48f71b3d0837fceba38 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Jul 2019 14:25:31 +0200 Subject: [PATCH 5/7] add test in column transformer --- sklearn/compose/tests/test_column_transformer.py | 5 ++++- sklearn/utils/__init__.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a9c4fd9e25fbe..a667b35cf65e3 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1111,10 +1111,13 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): tf.transform(X_array) -def test_column_transformer_mask_indexing(): +@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix]) +def test_column_transformer_mask_indexing(array_type): # Regression test for #14510 # Boolean array-like does not behave as boolean array with NumPy < 1.13 + # and sparse matrices as well X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) + X = array_type(X) column_transformer = ColumnTransformer( [('identity', FunctionTransformer(), [False, True, False, True])] ) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ac6446afcd6cf..83f4d7fd1876c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -226,7 +226,6 @@ def safe_indexing(X, indices, axis=0): ) -# FIXME: to be removed once NumPy 1.13 is the minimum version required def _array_indexing(array, key, axis=0): """Index an array consistently across NumPy version.""" if np_version < (1, 13) or issparse(array): From 9fb045dcf1b7923bf06021b6944ca0cb3dd8ad40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 14:22:46 +0200 Subject: [PATCH 6/7] raise error if axis not 0 or 1 --- sklearn/utils/__init__.py | 5 +++++ sklearn/utils/tests/test_utils.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 83f4d7fd1876c..3b4a20d08716b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -228,6 +228,11 @@ def safe_indexing(X, indices, axis=0): def _array_indexing(array, key, axis=0): """Index an array consistently across NumPy version.""" + if axis not in (0, 1): + raise ValueError( + "'axis' should be either 0 (to index rows) or 1 (to index " + " column). Got {} instead.".format(axis) + ) if np_version < (1, 13) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 35cfde4aaef7d..49f50eedc0a42 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -13,6 +13,7 @@ assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) +from sklearn.utils import _array_indexing from sklearn.utils import check_random_state from sklearn.utils import _check_key_type from sklearn.utils import deprecated @@ -387,6 +388,13 @@ def test_safe_indexing_mask_axis_1(array_type): assert_allclose_dense_sparse(X_masked, X_masked_array) +def test_array_indexing_array_error(): + X = np.array([[0, 1], [2, 3]]) + mask = [True, False] + with pytest.raises(ValueError, match="'axis' should be either 0"): + _array_indexing(X, mask, axis=3) + + def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) From b1918e83de705b97b789d970ecd471903141182c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 2 Aug 2019 15:33:41 +0200 Subject: [PATCH 7/7] address different comments --- doc/whats_new/v0.22.rst | 4 ++-- sklearn/compose/tests/test_column_transformer.py | 2 +- sklearn/utils/__init__.py | 2 +- sklearn/utils/tests/test_utils.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index db9e0e574da06..0f3c5665e3aa6 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -65,8 +65,8 @@ Changelog ...................... - |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to - select the proper columns when using a boolean list and NumPy older than - 1.13. + select the proper columns when using a boolean list, with NumPy older than + 1.12. :pr:`14510` by :user:`Guillaume Lemaitre `. :mod:`sklearn.datasets` diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a667b35cf65e3..d28a82374ad5b 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1114,7 +1114,7 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): @pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix]) def test_column_transformer_mask_indexing(array_type): # Regression test for #14510 - # Boolean array-like does not behave as boolean array with NumPy < 1.13 + # Boolean array-like does not behave as boolean array with NumPy < 1.12 # and sparse matrices as well X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) X = array_type(X) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3b4a20d08716b..f95a0d6cccc57 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -233,7 +233,7 @@ def _array_indexing(array, key, axis=0): "'axis' should be either 0 (to index rows) or 1 (to index " " column). Got {} instead.".format(axis) ) - if np_version < (1, 13) or issparse(array): + if np_version < (1, 12) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) if np.issubdtype(key_array.dtype, np.bool_): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 49f50eedc0a42..806295f1aae28 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -371,7 +371,7 @@ def test_safe_indexing_mock_pandas(asarray): def test_safe_indexing_mask_axis_1(array_type): # regression test for #14510 # check that boolean array-like and boolean array lead to the same indexing - # even in NumPy < 1.13 + # even in NumPy < 1.12 if array_type == 'array': array_constructor = np.asarray elif array_type == 'sparse':