From 3f9c591ec25929719acac7a2fd40ec7334dc2ff2 Mon Sep 17 00:00:00 2001
From: twsthomas
Date: Thu, 12 Sep 2019 09:58:41 +0200
Subject: [PATCH 01/16] label encode with nan and mixed types

---
 sklearn/preprocessing/_encoders.py           |  10 +-
 sklearn/preprocessing/label.py               | 101 ++++++++++++++++---
 sklearn/preprocessing/tests/test_encoders.py |  62 +++++++-----
 sklearn/preprocessing/tests/test_label.py    |  13 +++
 4 files changed, 140 insertions(+), 46 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c33744204fc36..9104d1028a1b8 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -26,7 +26,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

     """

-    def _check_X(self, X):
+    def _check_X(self, X, force_all_finite=True):
         """
         Perform custom check_array:
         - convert list of strings to object dtype
@@ -40,10 +40,10 @@ def _check_X(self, X):
         """
         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
             # if not a dataframe, do normal check_array validation
-            X_temp = check_array(X, dtype=None)
+            X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
             if (not hasattr(X, 'dtype')
                     and np.issubdtype(X_temp.dtype, np.str_)):
-                X = check_array(X, dtype=np.object)
+                X = check_array(X, dtype=np.object, force_all_finite=force_all_finite)
             else:
                 X = X_temp
             needs_validation = False
@@ -71,7 +71,7 @@ def _get_feature(self, X, feature_idx):
         return X[:, feature_idx]

     def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        X_list, n_samples, n_features = self._check_X(X, force_all_finite='allow-nan')

         if self.categories != 'auto':
             if len(self.categories) != n_features:
@@ -99,7 +99,7 @@ def _fit(self, X, handle_unknown='error'):
             self.categories_.append(cats)

     def _transform(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        X_list, n_samples, n_features = self._check_X(X, force_all_finite='allow-nan')

         X_int = np.zeros((n_samples, n_features), dtype=np.int)
         X_mask = np.ones((n_samples, n_features), dtype=np.bool)
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f16c7588fe13c..3dccab59cc5bc 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -18,12 +18,14 @@

 from ..utils.sparsefuncs import min_max_axis
 from ..utils import column_or_1d
+from ..utils import is_scalar_nan
+from ..utils.fixes import _object_dtype_isnan
 from ..utils.validation import check_array
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _num_samples
 from ..utils.multiclass import unique_labels
 from ..utils.multiclass import type_of_target
-
+from ..impute._base import _get_mask

 __all__ = [
     'label_binarize',
@@ -33,8 +35,17 @@
 ]


+def get_encoding(uniques, values):
+    if np.all(np.diff(uniques) > 0):
+        return np.searchsorted(uniques, values)
+    else:
+        table = {val: i for i, val in enumerate(uniques)}
+        return np.array([table[v] for v in values])
+
+
 def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
     # only used in _encode below, see docstring there for details
+    # expects that `values` and `uniques` do not contain nan
     if uniques is None:
         if encode:
             uniques, encoded = np.unique(values, return_inverse=True)
@@ -48,16 +59,20 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
             if diff:
                 raise ValueError("y contains previously unseen labels: %s"
                                  % str(diff))
-        encoded = np.searchsorted(uniques, values)
+        encoded = get_encoding(uniques, values)
         return uniques, encoded
     else:
         return uniques


 def _encode_python(values, uniques=None, encode=False):
-    # only used in _encode below, see docstring there for details
+    # only used in _encode below, see docstring there for details.
     if uniques is None:
-        uniques = sorted(set(values))
+        try:
+            uniques = sorted(set(values))
+        except TypeError:
+            # cannot sort mixed types (str and float)
+            uniques = set(values)
         uniques = np.array(uniques, dtype=values.dtype)
     if encode:
         table = {val: i for i, val in enumerate(uniques)}
@@ -71,15 +86,51 @@ def _encode_python(values, uniques=None, encode=False):
         return uniques


+def _encode_python_with_nan(values, uniques=None, encode=False):
+    # only used in _encode below, see docstring there for details
+    if uniques is None:
+        missing_vals = _get_mask(values, np.nan)
+        assert np.any(missing_vals)
+        # set([nan, nan]) can keep both entries, since nan != nan
+        uniques = set(values[~missing_vals]) | {np.nan}
+        uniques = np.array(list(uniques), dtype=values.dtype)
+    if encode:
+        table = dict()
+        for i, val in enumerate(uniques):
+            if is_scalar_nan(val):
+                # table[nan] would always raise KeyError (nan != nan)
+                nan_index = i
+            else:
+                table[val] = i
+        try:
+            encoded = []
+            for val in values:
+                if is_scalar_nan(val):
+                    encoded.append(nan_index)
+                else:
+                    encoded.append(table[val])
+            encoded = np.array(encoded)
+        except KeyError as e:
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(e))
+        return uniques, encoded
+    else:
+        return uniques
+
+
+def _encode_numpy_with_nan(values, uniques=None, encode=False, check_unknown=True):
+    # `np.unique` does not work here
+    return _encode_python_with_nan(values, uniques, encode)
+
+
 def _encode(values, uniques=None, encode=False, check_unknown=True):
     """Helper function to factorize (find uniques) and encode values.

-    Uses pure python method for object dtype, and numpy method for
-    all other dtypes.
-    The numpy method has the limitation that the `uniques` need to
-    be sorted. Importantly, this is not checked but assumed to already be
-    the case. The calling method needs to ensure this for all non-object
-    values.
+    Uses pure python method for object dtype or if values contains nan,
+    and numpy method for all other dtypes.
+    If values contains nan or mixed types (e.g. str and float), sorting
+    becomes meaningless, but sorted uniques are still nice to have since
+    they speed up `get_encoding`.

     Parameters
     ----------
@@ -107,16 +158,30 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
         If ``encode=True``.
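    As a plain-Python illustration of the two corner cases handled
    above (a sketch, independent of the scikit-learn helpers): mixed
    types cannot be sorted, and NaN only hits a dict entry when it is
    the very same object, since NaN != NaN.

    >>> import numpy as np
    >>> sorted({4, 'm'})  # doctest: +SKIP
    Traceback (most recent call last):
    TypeError: '<' not supported between instances of 'str' and 'int'
    >>> table = {np.nan: 0}
    >>> table[np.nan]  # same object, found by the identity check
    0
    >>> float('nan') in table  # a different NaN object
    False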
""" + nan_in_uniques = False + # TODO use instead _assert_all_finite + if uniques is not None and np.any(_get_mask(uniques, np.nan)): + nan_in_uniques = True + if values.dtype == object: try: - res = _encode_python(values, uniques, encode) + if np.any(_object_dtype_isnan(values)) or \ + nan_in_uniques: + res = _encode_python_with_nan(values, uniques, encode) + else: + res = _encode_python(values, uniques, encode) except TypeError: raise TypeError("argument must be a string or number") return res else: - return _encode_numpy(values, uniques, encode, + if (values.dtype.kind == 'f' and np.isnan(values)) or \ + nan_in_uniques: + # couldn't use `_encode_numpy` if `values` contains nan + res = _encode_python_with_nan(values, uniques, encode) + else: + res = _encode_numpy(values, uniques, encode, check_unknown=check_unknown) - + return res def _encode_check_unknown(values, uniques, return_mask=False): """ @@ -147,6 +212,11 @@ def _encode_check_unknown(values, uniques, return_mask=False): if values.dtype == object: uniques_set = set(uniques) diff = list(set(values) - uniques_set) + # set([np.nan]) - set([np.nan]) returns set([np.nan]) + if diff and any(_object_dtype_isnan(diff)): + if any(_object_dtype_isnan(uniques_set)) and\ + any(_object_dtype_isnan(set(values))): + diff = diff[~_object_dtype_isnan(diff)] if return_mask: if diff: valid_mask = np.array([val in uniques_set for val in values]) @@ -158,6 +228,11 @@ def _encode_check_unknown(values, uniques, return_mask=False): else: unique_values = np.unique(values) diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) + # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] + if any(is_scalar_nan(diff)): + if any(is_scalar_nan(unique_values)) and\ + any(is_scalar_nan(uniques)): + diff = [x for x in diff if not is_scalar_nan(x)] if return_mask: if diff: valid_mask = np.in1d(values, uniques) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 8e1a61781544a..1a11e287520ee 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -220,6 +220,31 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) +@pytest.mark.parametrize("X", [ + [['def', 1, np.nan], ['abc', 2, np.nan]], + np.array([[10, 1, np.nan], [5, 2, np.nan]]), + np.array([['b', 'A', np.nan], ['a', 'B', np.nan]], dtype=object) + ], ids=['mixed', 'numeric', 'object']) +def test_one_hot_encoder_with_nan(X): + Xtr = check_categorical_onehot(np.array(X)[:, [0]]) + assert_allclose(Xtr, [[0, 1], [1, 0]]) + + Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) + assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) + + Xtr = OneHotEncoder(categories='auto').fit_transform(X) + assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) + +def test_ohe_handle_unknow_sparse_nan(): + # TODO + pass + + +def test_ohe_handle_unknow_nan(): + # TODO + pass + + @pytest.mark.parametrize('sparse_', [False, True]) @pytest.mark.parametrize('drop', [None, 'first']) def test_one_hot_encoder_inverse(sparse_, drop): @@ -444,18 +469,14 @@ def test_one_hot_encoder_feature_names_drop(drop, expected_names): @pytest.mark.parametrize("as_data_frame", [False, True], ids=['array', 'dataframe']) @pytest.mark.parametrize("handle_unknown", ['error', 'ignore']) -def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): +def test_one_hot_encoder_accept_nan(X, as_data_frame, handle_unknown): if as_data_frame: pd = 
pytest.importorskip('pandas') X = pd.DataFrame(X) ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) + ohe.fit(X) + ohe.fit_transform(X) if as_data_frame: X_partial = X.iloc[:1, :] @@ -463,9 +484,7 @@ def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): X_partial = X[:1, :] ohe.fit(X_partial) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) + ohe.transform(X) @pytest.mark.parametrize("X", [ @@ -523,24 +542,6 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) -@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, - np.array([['a', np.nan]], dtype=object).T], - ids=['numeric', 'object']) -def test_ordinal_encoder_raise_missing(X): - ohe = OrdinalEncoder() - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) - - ohe.fit(X[:1, :]) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) - - def test_ordinal_encoder_raise_categories_shape(): X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T @@ -601,6 +602,11 @@ def test_one_hot_encoder_warning(): X = [['Male', 1], ['Female', 3]] np.testing.assert_no_warnings(enc.fit_transform, X) +def test_one_hot_encoder_accept_nan(): + enc = OneHotEncoder() + X = [[np.nan, 1], ['Female', np.nan]] + enc.fit_transform(X) + def test_one_hot_encoder_drop_manual(): cats_to_drop = ['def', 12, 3, 56] diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index a095f4ec64cab..5ce40941fc106 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -206,6 +206,19 @@ def test_label_encoder_negative_ints(): [0, 1, 4, 4, 5, -1, -1]) assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encode_with_nan(): + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float))) == 1 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object))) == 1 + assert len(_encode(np.asarray([4, 'm', np.nan]))) == 3 + assert len(_encode(np.asarray([4, np.nan]))) == 2 + + assert len(_encode(np.asarray([np.nan, np.nan],dtype=float), encode=True)[1]) == 2 + assert len(_encode(np.asarray([np.nan, np.nan],dtype=object), encode=True)[1]) == 2 + assert len(_encode(np.asarray([4, 'm', np.nan, np.nan, np.nan]), encode=True)[1]) == 5 + assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), encode=True)[1]) == 4 + +def test_label_encode_with_mixed_type(): + assert len(_encode(np.asarray([4, 'm']))) == 2 @pytest.mark.parametrize("dtype", ['str', 'object']) def test_label_encoder_str_bad_shape(dtype): From 49448c1b23f9115822b5af34295719ad68e5e381 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 09:30:21 +0200 Subject: [PATCH 02/16] chg noly label.py --- sklearn/preprocessing/label.py | 96 +++++++------------- sklearn/preprocessing/tests/test_encoders.py | 5 - sklearn/preprocessing/tests/test_label.py | 22 +++-- 3 files changed, 44 insertions(+), 79 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 3dccab59cc5bc..b8c801052fceb 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -43,9 +43,14 @@ def get_encoding(uniques, values): return np.array([table[v] for 
v in values]) -def _encode_numpy(values, uniques=None, encode=False, check_unknown=True): +def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, + allow_nan=False): # only used in _encode below, see docstring there for details - # excpect that `values` and `uniques` do not contains nan + + if allow_nan: + # `np.unique` does not work here + return _encode_python(values, uniques, encode, + allow_nan) if uniques is None: if encode: uniques, encoded = np.unique(values, return_inverse=True) @@ -59,40 +64,21 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True): if diff: raise ValueError("y contains previously unseen labels: %s" % str(diff)) - encoded = get_encoding(uniques, values) + encoded = np.searchsorted(uniques, values) return uniques, encoded else: return uniques -def _encode_python(values, uniques=None, encode=False): - # only used in _encode below, see docstring there for details. - if uniques is None: - try: - uniques = sorted(set(values)) - except TypeError: - # Couldn't sort with mixed type (str and float) - uniques = (set(values)) - uniques = np.array(uniques, dtype=values.dtype) - if encode: - table = {val: i for i, val in enumerate(uniques)} - try: - encoded = np.array([table[v] for v in values]) - except KeyError as e: - raise ValueError("y contains previously unseen labels: %s" - % str(e)) - return uniques, encoded - else: - return uniques - - -def _encode_python_with_nan(values, uniques=None, encode=False): +def _encode_python(values, uniques=None, encode=False, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: - missing_vals = _get_mask(values, np.nan) - assert np.any(missing_vals) - # set([nan, nan]) = {nan, nan} - uniques = set(values[~missing_vals]) | {np.nan} + if allow_nan: + missing_mask = _get_mask(values, np.nan) + if np.any(missing_mask): + uniques = sorted(set(values[~missing_mask]) | {np.nan}) + else: + uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) if encode: table = dict() @@ -118,19 +104,12 @@ def _encode_python_with_nan(values, uniques=None, encode=False): return uniques -def _encode_numpy_with_nan(values, uniques=None, encode=False, check_unknown=True): - # `np.unique` does not work here - _encode_python_with_nan(values, uniques, encode) - - -def _encode(values, uniques=None, encode=False, check_unknown=True): +def _encode(values, uniques=None, encode=False, check_unknown=True, + allow_nan=False): """Helper function to factorize (find uniques) and encode values. Uses pure python method for object dtype or if values contains nan, and numpy method for all other dtypes. - If values contains nan or mixed type (e.g. str and float) - sorted become meaningless (but still nice to have since it - speed up the `get_encoding`) Parameters ---------- @@ -158,30 +137,16 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): If ``encode=True``. 
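    A note on NaN ordering, which the `allow_nan` handling above relies
    on (illustrative): Python's built-in `sorted` uses pairwise `<`
    comparisons, and every comparison against NaN is False, so NaN is
    left wherever it happens to sit, while `np.sort` always places NaN
    last.

    >>> import numpy as np
    >>> sorted([np.nan, 1.0, 4.0])
    [nan, 1.0, 4.0]
    >>> np.sort([np.nan, 1.0, 4.0])
    array([ 1.,  4., nan])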
""" - nan_in_uniques = False - # TODO use instead _assert_all_finite - if uniques is not None and np.any(_get_mask(uniques, np.nan)): - nan_in_uniques = True - if values.dtype == object: try: - if np.any(_object_dtype_isnan(values)) or \ - nan_in_uniques: - res = _encode_python_with_nan(values, uniques, encode) - else: - res = _encode_python(values, uniques, encode) + res = _encode_python(values, uniques, encode, allow_nan) except TypeError: raise TypeError("argument must be a string or number") return res else: - if (values.dtype.kind == 'f' and np.isnan(values)) or \ - nan_in_uniques: - # couldn't use `_encode_numpy` if `values` contains nan - res = _encode_python_with_nan(values, uniques, encode) - else: - res = _encode_numpy(values, uniques, encode, - check_unknown=check_unknown) - return res + return _encode_numpy(values, uniques, encode, + check_unknown, allow_nan) + def _encode_check_unknown(values, uniques, return_mask=False): """ @@ -212,11 +177,12 @@ def _encode_check_unknown(values, uniques, return_mask=False): if values.dtype == object: uniques_set = set(uniques) diff = list(set(values) - uniques_set) - # set([np.nan]) - set([np.nan]) returns set([np.nan]) - if diff and any(_object_dtype_isnan(diff)): - if any(_object_dtype_isnan(uniques_set)) and\ - any(_object_dtype_isnan(set(values))): - diff = diff[~_object_dtype_isnan(diff)] + # set([np.nan]) - set([np.nan]) returns set() + # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} + if diff and any(_get_mask(diff, np.nan)): + if any(_get_mask(uniques_set, np.nan)) and\ + any(_get_mask(set(values), np.nan)): + diff = diff[~_get_mask(diff, np.nan)] if return_mask: if diff: valid_mask = np.array([val in uniques_set for val in values]) @@ -227,11 +193,11 @@ def _encode_check_unknown(values, uniques, return_mask=False): return diff else: unique_values = np.unique(values) - diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) + diff = np.setdiff1d(unique_values, uniques, assume_unique=True) # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] - if any(is_scalar_nan(diff)): - if any(is_scalar_nan(unique_values)) and\ - any(is_scalar_nan(uniques)): + if any(_get_mask(diff, np.nan)): + if any(_get_mask(unique_values, np.nan)) and\ + any(_get_mask(uniques, np.nan)): diff = [x for x in diff if not is_scalar_nan(x)] if return_mask: if diff: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 1a11e287520ee..d9c5092affd2d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -602,11 +602,6 @@ def test_one_hot_encoder_warning(): X = [['Male', 1], ['Female', 3]] np.testing.assert_no_warnings(enc.fit_transform, X) -def test_one_hot_encoder_accept_nan(): - enc = OneHotEncoder() - X = [[np.nan, 1], ['Female', np.nan]] - enc.fit_transform(X) - def test_one_hot_encoder_drop_manual(): cats_to_drop = ['def', 12, 3, 56] diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 5ce40941fc106..de5c4a7191a91 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -206,20 +206,24 @@ def test_label_encoder_negative_ints(): [0, 1, 4, 4, 5, -1, -1]) assert_raises(ValueError, le.transform, [0, 6]) + def test_label_encode_with_nan(): - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float))) == 1 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object))) == 1 - assert len(_encode(np.asarray([4, 'm', 
np.nan]))) == 3 - assert len(_encode(np.asarray([4, np.nan]))) == 2 - - assert len(_encode(np.asarray([np.nan, np.nan],dtype=float), encode=True)[1]) == 2 - assert len(_encode(np.asarray([np.nan, np.nan],dtype=object), encode=True)[1]) == 2 - assert len(_encode(np.asarray([4, 'm', np.nan, np.nan, np.nan]), encode=True)[1]) == 5 - assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), encode=True)[1]) == 4 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), allow_nan=True)) == 1 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), allow_nan=True)) == 1 + assert len(_encode(np.asarray([4, np.nan]), allow_nan=True)) == 2 + + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), + encode=True, allow_nan=True)[1]) == 2 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), + encode=True, allow_nan=True)[1]) == 2 + assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), + encode=True, allow_nan=True)[1]) == 4 + def test_label_encode_with_mixed_type(): assert len(_encode(np.asarray([4, 'm']))) == 2 + @pytest.mark.parametrize("dtype", ['str', 'object']) def test_label_encoder_str_bad_shape(dtype): le = LabelEncoder() From 302f3aedf135a37f9bb04124d93bcb2cf60f4575 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 10:59:26 +0200 Subject: [PATCH 03/16] iter, restore _encoders --- sklearn/preprocessing/_encoders.py | 483 ++++++++++++++++--- sklearn/preprocessing/label.py | 72 ++- sklearn/preprocessing/tests/test_encoders.py | 417 +++++++++++----- sklearn/preprocessing/tests/test_label.py | 71 ++- 4 files changed, 818 insertions(+), 225 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 9104d1028a1b8..c1acfbe799485 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,14 +2,20 @@ # Joris Van den Bossche # License: BSD 3 clause +import numbers +import warnings + import numpy as np from scipy import sparse +from .. 
import get_config as _get_config from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils.fixes import _argmax +from ..utils import deprecated +from ..utils.fixes import _argmax, _object_dtype_isnan from ..utils.validation import check_is_fitted +from .base import _transform_selected from .label import _encode, _encode_check_unknown @@ -26,7 +32,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin): """ - def _check_X(self, X, force_all_finite=True): + def _check_X(self, X): """ Perform custom check_array: - convert list of strings to object dtype @@ -40,10 +46,10 @@ def _check_X(self, X, force_all_finite=True): """ if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation - X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite) + X_temp = check_array(X, dtype=None) if (not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_)): - X = check_array(X, dtype=np.object, force_all_finite=force_all_finite) + X = check_array(X, dtype=np.object) else: X = X_temp needs_validation = False @@ -71,10 +77,10 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X, force_all_finite='allow-nan') + X_list, n_samples, n_features = self._check_X(X) - if self.categories != 'auto': - if len(self.categories) != n_features: + if self._categories != 'auto': + if len(self._categories) != n_features: raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") @@ -82,10 +88,10 @@ def _fit(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self.categories == 'auto': + if self._categories == 'auto': cats = _encode(Xi) else: - cats = np.array(self.categories[i], dtype=Xi.dtype) + cats = np.array(self._categories[i], dtype=Xi.dtype) if Xi.dtype != object: if not np.all(np.sort(cats) == cats): raise ValueError("Unsorted categories are not " @@ -99,19 +105,11 @@ def _fit(self, X, handle_unknown='error'): self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X, force_all_finite=('allow-nan')) + X_list, n_samples, n_features = self._check_X(X) X_int = np.zeros((n_samples, n_features), dtype=np.int) X_mask = np.ones((n_samples, n_features), dtype=np.bool) - if n_features != len(self.categories_): - raise ValueError( - "The number of features in X is different to the number of " - "features of the fitted data. The fitted data had {} features " - "and the X has {} features." - .format(len(self.categories_,), n_features) - ) - for i in range(n_features): Xi = X_list[i] diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], @@ -136,31 +134,26 @@ def _transform(self, X, handle_unknown='error'): Xi = Xi.copy() Xi[~valid_mask] = self.categories_[i][0] - # We use check_unknown=False, since _encode_check_unknown was - # already called above. - _, encoded = _encode(Xi, self.categories_[i], encode=True, - check_unknown=False) + _, encoded = _encode(Xi, self.categories_[i], encode=True) X_int[:, i] = encoded return X_int, X_mask - def _more_tags(self): - return {'X_types': ['categorical']} - class OneHotEncoder(_BaseEncoder): - """Encode categorical features as a one-hot numeric array. + """Encode categorical integer features as a one-hot numeric array. 
The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array (depending on the ``sparse`` - parameter) + returns a sparse matrix or dense array. By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. + The OneHotEncoder previously assumed that the input features take on + values in the range [0, max(values)). This behaviour is deprecated. This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. @@ -209,6 +202,34 @@ class OneHotEncoder(_BaseEncoder): will be all zeros. In the inverse transform, an unknown category will be denoted as None. + n_values : 'auto', int or array of ints, default='auto' + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : number of categorical values per feature. + Each feature value should be in ``range(n_values)`` + - array : ``n_values[i]`` is the number of categorical values in + ``X[:, i]``. Each feature value should be + in ``range(n_values[i])`` + + .. deprecated:: 0.20 + The `n_values` keyword was deprecated in version 0.20 and will + be removed in 0.22. Use `categories` instead. + + categorical_features : 'all' or array of indices or mask, default='all' + Specify what features are treated as categorical. + + - 'all': All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + .. deprecated:: 0.20 + The `categorical_features` keyword was deprecated in version + 0.20 and will be removed in 0.22. + You can use the ``ColumnTransformer`` instead. + Attributes ---------- categories_ : list of arrays @@ -222,6 +243,31 @@ class OneHotEncoder(_BaseEncoder): be dropped for each feature. None if all the transformed features will be retained. + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + .. deprecated:: 0.20 + The ``active_features_`` attribute was deprecated in version + 0.20 and will be removed in 0.22. + + feature_indices_ : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by ``active_features_`` afterwards) + + .. deprecated:: 0.20 + The ``feature_indices_`` attribute was deprecated in version + 0.20 and will be removed in 0.22. + + n_values_ : array of shape (n_features,) + Maximum number of values per feature. + + .. deprecated:: 0.20 + The ``n_values_`` attribute was deprecated in version + 0.20 and will be removed in 0.22. + Examples -------- Given a dataset with two features, we let the encoder find the unique @@ -231,7 +277,11 @@ class OneHotEncoder(_BaseEncoder): >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - OneHotEncoder(handle_unknown='ignore') + ... # doctest: +ELLIPSIS + ... 
# doctest: +NORMALIZE_WHITESPACE + OneHotEncoder(categorical_features=None, categories=None, drop=None, + dtype=<... 'numpy.float64'>, handle_unknown='ignore', + n_values=None, sparse=True) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] @@ -265,27 +315,184 @@ class OneHotEncoder(_BaseEncoder): matrix indicating the presence of a class label. """ - def __init__(self, categories='auto', drop=None, sparse=True, - dtype=np.float64, handle_unknown='error'): + def __init__(self, n_values=None, categorical_features=None, + categories=None, drop=None, sparse=True, dtype=np.float64, + handle_unknown='error'): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown + self.n_values = n_values + self.categorical_features = categorical_features self.drop = drop - def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - # If we have both dropped columns and ignored unknown - # values, there will be ambiguous cells. This creates difficulties - # in interpreting the model. - if self.drop is not None and self.handle_unknown != 'error': + # Deprecated attributes + + @deprecated("The ``active_features_`` attribute was deprecated in version " + "0.20 and will be removed 0.22.") + @property + def active_features_(self): + check_is_fitted(self, 'categories_') + return self._active_features_ + + @deprecated("The ``feature_indices_`` attribute was deprecated in version " + "0.20 and will be removed 0.22.") + @property + def feature_indices_(self): + check_is_fitted(self, 'categories_') + return self._feature_indices_ + + @deprecated("The ``n_values_`` attribute was deprecated in version " + "0.20 and will be removed 0.22.") + @property + def n_values_(self): + check_is_fitted(self, 'categories_') + return self._n_values_ + + def _handle_deprecations(self, X): + # internal version of the attributes to handle deprecations + self._n_values = self.n_values + self._categories = getattr(self, '_categories', None) + self._categorical_features = getattr(self, '_categorical_features', + None) + + # user manually set the categories or second fit -> never legacy mode + if self.categories is not None or self._categories is not None: + self._legacy_mode = False + if self.categories is not None: + self._categories = self.categories + + # categories not set -> infer if we need legacy mode or not + elif self.n_values is not None and self.n_values != 'auto': + msg = ( + "Passing 'n_values' is deprecated in version 0.20 and will be " + "removed in 0.22. You can use the 'categories' keyword " + "instead. 'n_values=n' corresponds to " + "'categories=[range(n)] * n_features'." + ) + warnings.warn(msg, DeprecationWarning) + self._legacy_mode = True + + else: # n_values = 'auto' + # n_values can also be None (default to catch usage), so set + # _n_values to 'auto' explicitly + self._n_values = 'auto' + if self.handle_unknown == 'ignore': + # no change in behaviour, no need to raise deprecation warning + self._legacy_mode = False + self._categories = 'auto' + if self.n_values == 'auto': + # user manually specified this + msg = ( + "Passing 'n_values' is deprecated in version 0.20 and " + "will be removed in 0.22. n_values='auto' can be " + "replaced with categories='auto'." 
+ ) + warnings.warn(msg, DeprecationWarning) + else: + # check if we have integer or categorical input + try: + check_array(X, dtype=np.int) + except ValueError: + self._legacy_mode = False + self._categories = 'auto' + else: + if self.drop is None: + msg = ( + "The handling of integer data will change in " + "version 0.22. Currently, the categories are " + "determined based on the range " + "[0, max(values)], while in the future they " + "will be determined based on the unique " + "values.\nIf you want the future behaviour " + "and silence this warning, you can specify " + "\"categories='auto'\".\n" + "In case you used a LabelEncoder before this " + "OneHotEncoder to convert the categories to " + "integers, then you can now use the " + "OneHotEncoder directly." + ) + warnings.warn(msg, FutureWarning) + self._legacy_mode = True + else: + msg = ( + "The handling of integer data will change in " + "version 0.22. Currently, the categories are " + "determined based on the range " + "[0, max(values)], while in the future they " + "will be determined based on the unique " + "values.\n The old behavior is not compatible " + "with the `drop` parameter. Instead, you " + "must manually specify \"categories='auto'\" " + "if you wish to use the `drop` parameter on " + "an array of entirely integer data. This will " + "enable the future behavior." + ) + raise ValueError(msg) + + # if user specified categorical_features -> always use legacy mode + if self.categorical_features is not None: + if (isinstance(self.categorical_features, str) + and self.categorical_features == 'all'): + warnings.warn( + "The 'categorical_features' keyword is deprecated in " + "version 0.20 and will be removed in 0.22. The passed " + "value of 'all' is the default and can simply be removed.", + DeprecationWarning) + else: + if self.categories is not None: + raise ValueError( + "The 'categorical_features' keyword is deprecated, " + "and cannot be used together with specifying " + "'categories'.") + warnings.warn( + "The 'categorical_features' keyword is deprecated in " + "version 0.20 and will be removed in 0.22. You can " + "use the ColumnTransformer instead.", DeprecationWarning) + # Set categories_ to empty list if no categorical columns exist + n_features = X.shape[1] + sel = np.zeros(n_features, dtype=bool) + sel[np.asarray(self.categorical_features)] = True + if sum(sel) == 0: + self.categories_ = [] + self._legacy_mode = True + self._categorical_features = self.categorical_features + else: + self._categorical_features = 'all' + + # Prevents new drop functionality from being used in legacy mode + if self._legacy_mode and self.drop is not None: raise ValueError( - "`handle_unknown` must be 'error' when the drop parameter is " - "specified, as both would create categories that are all " - "zero.") + "The `categorical_features` and `n_values` keywords " + "are deprecated, and cannot be used together " + "with 'drop'.") + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to determine the categories of each feature. 
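        For instance (a sketch against this 0.20-era API, where passing
        ``categories='auto'`` opts out of the legacy integer handling
        described above):

        >>> enc = OneHotEncoder(categories='auto',
        ...                     handle_unknown='ignore')
        >>> _ = enc.fit([['a', 0], ['b', 1]])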
+ + Returns + ------- + self + """ + + self._validate_keywords() + + self._handle_deprecations(X) + + if self._legacy_mode: + _transform_selected(X, self._legacy_fit_transform, self.dtype, + self._categorical_features, + copy=True) + return self + else: + self._fit(X, handle_unknown=self.handle_unknown) + self.drop_idx_ = self._compute_drop_idx() + return self def _compute_drop_idx(self): if self.drop is None: @@ -323,22 +530,78 @@ def _compute_drop_idx(self): "'first', None or array of objects, got {}") raise ValueError(msg.format(type(self.drop))) - def fit(self, X, y=None): - """Fit OneHotEncoder to X. + def _validate_keywords(self): + if self.handle_unknown not in ('error', 'ignore'): + msg = ("handle_unknown should be either 'error' or 'ignore', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + # If we have both dropped columns and ignored unknown + # values, there will be ambiguous cells. This creates difficulties + # in interpreting the model. + if self.drop is not None and self.handle_unknown != 'error': + raise ValueError( + "`handle_unknown` must be 'error' when the drop parameter is " + "specified, as both would create categories that are all " + "zero.") - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to determine the categories of each feature. + def _legacy_fit_transform(self, X): + """Assumes X contains only categorical features.""" + dtype = getattr(X, 'dtype', None) + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("OneHotEncoder in legacy mode cannot handle " + "categories encoded as negative integers. " + "Please set categories='auto' explicitly to " + "be able to use arbitrary integer values as " + "category identifiers.") + n_samples, n_features = X.shape + if (isinstance(self._n_values, str) and + self._n_values == 'auto'): + n_values = np.max(X, axis=0) + 1 + elif isinstance(self._n_values, numbers.Integral): + if (np.max(X, axis=0) >= self._n_values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % self._n_values) + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self._n_values) + else: + try: + n_values = np.asarray(self._n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. 
Expected" + " 'auto', int or array of ints, got %r" + % type(self._n_values)) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") - Returns - ------- - self - """ - self._validate_keywords() - self._fit(X, handle_unknown=self.handle_unknown) - self.drop_idx_ = self._compute_drop_idx() - return self + self._n_values_ = n_values + self.categories_ = [np.arange(n_val - 1, dtype=dtype) + for n_val in n_values] + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + self._feature_indices_ = indices + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + if (isinstance(self._n_values, str) and + self._n_values == 'auto'): + mask = np.array(out.sum(axis=0)).ravel() != 0 + active_features = np.where(mask)[0] + out = out[:, active_features] + self._active_features_ = active_features + + self.categories_ = [ + np.unique(X[:, i]).astype(dtype) if dtype + else np.unique(X[:, i]) for i in range(n_features)] + + return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. @@ -355,23 +618,64 @@ def fit_transform(self, X, y=None): X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ + self._validate_keywords() - return super().fit_transform(X, y) - def transform(self, X): - """Transform X using one-hot encoding. + self._handle_deprecations(X) - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to encode. + if self._legacy_mode: + return _transform_selected( + X, self._legacy_fit_transform, self.dtype, + self._categorical_features, copy=True) + else: + return self.fit(X).transform(X) + + def _legacy_transform(self, X): + """Assumes X contains only categorical features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("OneHotEncoder in legacy mode cannot handle " + "categories encoded as negative integers. " + "Please set categories='auto' explicitly to " + "be able to use arbitrary integer values as " + "category identifiers.") + n_samples, n_features = X.shape - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ - check_is_fitted(self) + indices = self._feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + # We use only those categorical features of X that are known using fit. + # i.e lesser than n_values_ using mask. + # This means, if self.handle_unknown is "ignore", the row_indices and + # col_indices corresponding to the unknown categorical feature are + # ignored. + mask = (X < self._n_values_).ravel() + if np.any(~mask): + if self.handle_unknown not in ['error', 'ignore']: + raise ValueError("handle_unknown should be either error or " + "unknown got %s" % self.handle_unknown) + if self.handle_unknown == 'error': + raise ValueError("unknown categorical feature present %s " + "during transform." 
% X.ravel()[~mask]) + + column_indices = (X + indices[:-1]).ravel()[mask] + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features)[mask] + data = np.ones(np.sum(mask)) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + if (isinstance(self._n_values, str) and + self._n_values == 'auto'): + out = out[:, self._active_features_] + + return out if self.sparse else out.toarray() + + def _transform_new(self, X): + """New implementation assuming categorical input""" # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) @@ -406,6 +710,27 @@ def transform(self, X): else: return out + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to encode. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array + Transformed input. + """ + check_is_fitted(self, 'categories_') + if self._legacy_mode: + return _transform_selected(X, self._legacy_transform, self.dtype, + self._categorical_features, + copy=True) + else: + return self._transform_new(X) + def inverse_transform(self, X): """Convert the back data to the original representation. @@ -423,7 +748,10 @@ def inverse_transform(self, X): Inverse transformed array. """ - check_is_fitted(self) + # if self._legacy_mode: + # raise ValueError("only supported for categorical features") + + check_is_fitted(self, 'categories_') X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -506,7 +834,7 @@ def get_feature_names(self, input_features=None): output_feature_names : array of string, length n_output_features """ - check_is_fitted(self) + check_is_fitted(self, 'categories_') cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] @@ -568,7 +896,8 @@ class OrdinalEncoder(_BaseEncoder): >>> enc = OrdinalEncoder() >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - OrdinalEncoder() + ... # doctest: +ELLIPSIS + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) @@ -604,6 +933,9 @@ def fit(self, X, y=None): self """ + # base classes uses _categories to deal with deprecations in + # OneHoteEncoder: can be removed once deprecations are removed + self._categories = self.categories self._fit(X) return self @@ -639,7 +971,7 @@ def inverse_transform(self, X): Inverse transformed array. 
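        A round-trip sketch (illustrative):

        >>> enc = OrdinalEncoder()
        >>> codes = enc.fit_transform([['Male', 1], ['Female', 3]])
        >>> codes
        array([[1., 0.],
               [0., 1.]])
        >>> enc.inverse_transform(codes)  # doctest: +SKIP
        array([['Male', 1],
               ['Female', 3]], dtype=object)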
""" - check_is_fitted(self) + check_is_fitted(self, 'categories_') X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -660,3 +992,6 @@ def inverse_transform(self, X): X_tr[:, i] = self.categories_[i][labels] return X_tr + + def _more_tags(self): + return {'X_types': ['categorical']} diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index b8c801052fceb..3adf40cc519b2 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -47,20 +47,31 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): # only used in _encode below, see docstring there for details - if allow_nan: - # `np.unique` does not work here - return _encode_python(values, uniques, encode, - allow_nan) if uniques is None: if encode: uniques, encoded = np.unique(values, return_inverse=True) + # np.nan is always sorted last + if len(uniques) and is_scalar_nan(uniques[-1]): + if not allow_nan: + raise ValueError('nan found in values and allow_nan=False') + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + if encode: + encoded[encoded > nan_idx] = nan_idx return uniques, encoded else: # unique sorts - return np.unique(values) + uniques = np.unique(values) + # np.nan is always sorted last + if len(uniques) and is_scalar_nan(uniques[-1]): + if not allow_nan: + raise ValueError('nan found in values and allow_nan=False') + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + return uniques if encode: if check_unknown: - diff = _encode_check_unknown(values, uniques) + diff = _encode_check_unknown(values, uniques, allow_nan=allow_nan) if diff: raise ValueError("y contains previously unseen labels: %s" % str(diff)) @@ -73,10 +84,13 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, def _encode_python(values, uniques=None, encode=False, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: - if allow_nan: - missing_mask = _get_mask(values, np.nan) - if np.any(missing_mask): - uniques = sorted(set(values[~missing_mask]) | {np.nan}) + missing_mask = _get_mask(values, np.nan) + if np.any(missing_mask): + if not allow_nan: + raise ValueError('nan found in values and allow_nan=False') + else: + # sorted([4, np.nan]) != np.sort([4, np.nan]) + uniques = np.sort(list(set(values[~missing_mask]) | {np.nan})) else: uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) @@ -99,6 +113,11 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) + except UnboundLocalError as e: + # 'nan_index' referenced before assignment + raise ValueError("y contains previously unseen label nan, " + "consider using allow_nan=True. %s" + % str(e)) return uniques, encoded else: return uniques @@ -108,8 +127,12 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): """Helper function to factorize (find uniques) and encode values. - Uses pure python method for object dtype or if values contains nan, - and numpy method for all other dtypes. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + The numpy method has the limitation that the `uniques` need to + be sorted. Importantly, this is not checked but assumed to already be + the case. The calling method needs to ensure this for all non-object + values. 
Parameters ---------- @@ -127,6 +150,9 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, True in this case. This parameter is useful for _BaseEncoder._transform() to avoid calling _encode_check_unknown() twice. + allow_nan : bool, default False + if True, encode np.nan as another category. Otherwise raise an error + if nan are present Returns ------- @@ -148,7 +174,7 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, check_unknown, allow_nan) -def _encode_check_unknown(values, uniques, return_mask=False): +def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): """ Helper function to check for unknowns in values to be encoded. @@ -176,13 +202,16 @@ def _encode_check_unknown(values, uniques, return_mask=False): """ if values.dtype == object: uniques_set = set(uniques) - diff = list(set(values) - uniques_set) + diff = np.array(list(set(values) - uniques_set)) # set([np.nan]) - set([np.nan]) returns set() # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} if diff and any(_get_mask(diff, np.nan)): - if any(_get_mask(uniques_set, np.nan)) and\ - any(_get_mask(set(values), np.nan)): - diff = diff[~_get_mask(diff, np.nan)] + if not allow_nan: + raise ValueError('Nan found during check_unknown') + else: + if any(_get_mask(uniques_set, np.nan)) and\ + any(_get_mask(set(values), np.nan)): + diff = diff[~_get_mask(diff, np.nan)] if return_mask: if diff: valid_mask = np.array([val in uniques_set for val in values]) @@ -196,9 +225,12 @@ def _encode_check_unknown(values, uniques, return_mask=False): diff = np.setdiff1d(unique_values, uniques, assume_unique=True) # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] if any(_get_mask(diff, np.nan)): - if any(_get_mask(unique_values, np.nan)) and\ - any(_get_mask(uniques, np.nan)): - diff = [x for x in diff if not is_scalar_nan(x)] + if not allow_nan: + raise ValueError('Nan found during check_unknown') + else: + if any(_get_mask(unique_values, np.nan)) and\ + any(_get_mask(uniques, np.nan)): + diff = [x for x in diff if not is_scalar_nan(x)] if return_mask: if diff: valid_mask = np.in1d(values, uniques) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index d9c5092affd2d..29cd6602e4f10 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -8,8 +8,14 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns +from sklearn.utils.testing import assert_warns_message +from sklearn.utils.testing import assert_no_warnings from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder @@ -21,37 +27,228 @@ def toarray(a): return a -def test_one_hot_encoder_sparse_dense(): - # check that sparse and dense will give the same results +def test_one_hot_encoder_sparse(): + # Test OneHotEncoder's fit and transform. 
+ X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=4) + enc = OneHotEncoder(n_values=4) + with ignore_warnings(category=DeprecationWarning): + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=[3, 2, 2]) + enc = OneHotEncoder(n_values=[3, 2, 2]) + with ignore_warnings(category=DeprecationWarning): + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raised when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + error_msg = r"unknown categorical feature present \[2\] during transform" + assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) + with ignore_warnings(category=DeprecationWarning): + assert_raises( + ValueError, + OneHotEncoder(n_values=2).fit_transform, X) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + + # test that error is raised when wrong number of features in fit + # with prespecified n_values + with ignore_warnings(category=DeprecationWarning): + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + with ignore_warnings(category=DeprecationWarning): + assert_raises( + TypeError, OneHotEncoder(n_values=np.int).fit, X) - X = np.array([[3, 2, 1], [0, 1, 1]]) - enc_sparse = OneHotEncoder() - enc_dense = OneHotEncoder(sparse=False) + enc = OneHotEncoder() + # test negative input to fit + with ignore_warnings(category=FutureWarning): + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + with ignore_warnings(category=FutureWarning): + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + enc = OneHotEncoder(drop='first', n_values=1) + for method in (enc.fit, enc.fit_transform): + assert_raises_regex( + ValueError, + 'The `categorical_features` and `n_values` keywords ', + method, [[0], [-1]]) + + enc = OneHotEncoder(drop='first', categorical_features='all') + assert_raises_regex( + ValueError, + 'The `categorical_features` and `n_values` keywords ', + method, [[0], [-1]]) + + +def test_one_hot_encoder_dense(): + # check for sparse=False + X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder(sparse=False) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + # discover max values automatically + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - X_trans_sparse = 
enc_sparse.fit_transform(X) - X_trans_dense = enc_dense.fit_transform(X) + # check outcome + assert_array_equal(X_trans, + np.array([[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]])) + + +def test_one_hot_encoder_deprecationwarnings(): + for X in [[[3, 2, 1], [0, 1, 1]], + [[3., 2., 1.], [0., 1., 1.]]]: + enc = OneHotEncoder() + assert_warns_message(FutureWarning, "handling of integer", + enc.fit, X) + enc = OneHotEncoder() + assert_warns_message(FutureWarning, "handling of integer", + enc.fit_transform, X) + + # check it still works correctly as well + with ignore_warnings(category=FutureWarning): + X_trans = enc.fit_transform(X).toarray() + res = [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]] + assert_array_equal(X_trans, res) + + # check deprecated attributes + assert_warns(DeprecationWarning, lambda: enc.active_features_) + assert_warns(DeprecationWarning, lambda: enc.feature_indices_) + assert_warns(DeprecationWarning, lambda: enc.n_values_) + + # check no warning is raised if keyword is specified + enc = OneHotEncoder(categories='auto') + assert_no_warnings(enc.fit, X) + enc = OneHotEncoder(categories='auto') + assert_no_warnings(enc.fit_transform, X) + X_trans = enc.fit_transform(X).toarray() + assert_array_equal(X_trans, res) - assert X_trans_sparse.shape == (2, 5) - assert X_trans_dense.shape == (2, 5) + # check there is also a warning if the default is passed + enc = OneHotEncoder(n_values='auto', handle_unknown='ignore') + assert_warns(DeprecationWarning, enc.fit, X) - assert sparse.issparse(X_trans_sparse) - assert not sparse.issparse(X_trans_dense) + X = np.array([['cat1', 'cat2']], dtype=object).T + enc = OneHotEncoder(categorical_features='all') + assert_warns(DeprecationWarning, enc.fit, X) - # check outcome - assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) +def test_one_hot_encoder_force_new_behaviour(): + # ambiguous integer case (non secutive range of categories) + X = np.array([[1, 2]]).T + X2 = np.array([[0, 1]]).T -def test_one_hot_encoder_diff_n_features(): - X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) - X2 = np.array([[1, 0]]) + # without argument -> by default using legacy behaviour with warnings enc = OneHotEncoder() + + with ignore_warnings(category=FutureWarning): + enc.fit(X) + + res = enc.transform(X2) + exp = np.array([[0, 0], [1, 0]]) + assert_array_equal(res.toarray(), exp) + + # with explicit auto argument -> don't use legacy behaviour + # (so will raise an error on unseen value within range) + enc = OneHotEncoder(categories='auto') enc.fit(X) - err_msg = ("The number of features in X is different to the number of " - "features of the fitted data.") - with pytest.raises(ValueError, match=err_msg): - enc.transform(X2) + assert_raises(ValueError, enc.transform, X2) + + +def _run_one_hot(X, X2, cat): + # enc = assert_warns( + # DeprecationWarning, + # OneHotEncoder, categorical_features=cat) + enc = OneHotEncoder(categorical_features=cat) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + Xtr = enc.fit_transform(X) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + X2tr = enc.fit(X).transform(X2) + return Xtr, X2tr + + +def _check_one_hot(X, X2, cat, n_features): + ind = np.where(cat)[0] + # With mask + A, B = _run_one_hot(X, X2, cat) + # With indices + C, D = _run_one_hot(X, X2, ind) + # Check shape + assert_equal(A.shape, (2, n_features)) + assert_equal(B.shape, (1, n_features)) + assert_equal(C.shape, (2, 
n_features))
+    assert_equal(D.shape, (1, n_features))
+    # Check that mask and indices give the same results
+    assert_array_equal(toarray(A), toarray(C))
+    assert_array_equal(toarray(B), toarray(D))
+
+
+def test_one_hot_encoder_categorical_features():
+    X = np.array([[3, 2, 1], [0, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
+
+    cat = [True, False, False]
+    _check_one_hot(X, X2, cat, 4)
+
+    # Edge case: all non-categorical
+    cat = [False, False, False]
+    _check_one_hot(X, X2, cat, 3)
+
+    # Edge case: all categorical
+    cat = [True, True, True]
+    _check_one_hot(X, X2, cat, 5)
+
+    # check error raised if also specifying categories
+    oh = OneHotEncoder(categories=[range(3)],
+                       categorical_features=[True, False, False])
+    assert_raises(ValueError, oh.fit, X)
+
+
+def test_one_hot_encoder_categorical_features_ignore_unknown():
+    # GH12881 bug in combination of categorical_features with ignore
+    X = np.array([[1, 2, 3], [4, 5, 6], [2, 3, 2]]).T
+    oh = OneHotEncoder(categorical_features=[2], handle_unknown='ignore')
+
+    with ignore_warnings(category=DeprecationWarning):
+        res = oh.fit_transform(X)
+
+    expected = np.array([[1, 0, 1], [0, 1, 0], [1, 2, 3], [4, 5, 6]]).T
+    assert_array_equal(res.toarray(), expected)


 def test_one_hot_encoder_handle_unknown():
@@ -61,9 +258,8 @@ def test_one_hot_encoder_handle_unknown():
     # Test that one hot encoder raises error for unknown features
     # present during transform.
     oh = OneHotEncoder(handle_unknown='error')
-    oh.fit(X)
-    with pytest.raises(ValueError, match='Found unknown categories'):
-        oh.transform(X2)
+    assert_warns(FutureWarning, oh.fit, X)
+    assert_raises(ValueError, oh.transform, X2)

     # Test the ignore option, ignores unknown features (giving all 0's)
     oh = OneHotEncoder(handle_unknown='ignore')
@@ -77,8 +273,7 @@ def test_one_hot_encoder_handle_unknown():
     # Raise error if handle_unknown is neither ignore or error.
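     # A minimal sketch of the contract this test exercises (an editorial
     # example, not code from the patch; it assumes only the OneHotEncoder
     # API used above): with handle_unknown='ignore', an unseen category
     # encodes to an all-zero row instead of raising.
     #
     #     enc = OneHotEncoder(handle_unknown='ignore', categories='auto')
     #     enc.fit([['a'], ['b']])
     #     enc.transform([['c']]).toarray()   # -> array([[0., 0.]])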
oh = OneHotEncoder(handle_unknown='42') - with pytest.raises(ValueError, match='handle_unknown should be either'): - oh.fit(X) + assert_raises(ValueError, oh.fit, X) def test_one_hot_encoder_not_fitted(): @@ -90,6 +285,19 @@ def test_one_hot_encoder_not_fitted(): enc.transform(X) +def test_one_hot_encoder_no_categorical_features(): + X = np.array([[3, 2, 1], [0, 1, 1]], dtype='float64') + + cat = [False, False, False] + enc = OneHotEncoder(categorical_features=cat) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + X_tr = enc.fit_transform(X) + expected_features = np.array([], dtype='object') + assert_array_equal(X, X_tr) + assert_array_equal(enc.get_feature_names(), expected_features) + assert enc.categories_ == [] + + def test_one_hot_encoder_handle_unknown_strings(): X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1)) X2 = np.array(['55555', '22']).reshape((-1, 1)) @@ -138,47 +346,6 @@ def test_one_hot_encoder_dtype_pandas(output_dtype): assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) -def test_one_hot_encoder_feature_names(): - enc = OneHotEncoder() - X = [['Male', 1, 'girl', 2, 3], - ['Female', 41, 'girl', 1, 10], - ['Male', 51, 'boy', 12, 3], - ['Male', 91, 'girl', 21, 30]] - - enc.fit(X) - feature_names = enc.get_feature_names() - assert isinstance(feature_names, np.ndarray) - - assert_array_equal(['x0_Female', 'x0_Male', - 'x1_1', 'x1_41', 'x1_51', 'x1_91', - 'x2_boy', 'x2_girl', - 'x3_1', 'x3_2', 'x3_12', 'x3_21', - 'x4_3', - 'x4_10', 'x4_30'], feature_names) - - feature_names2 = enc.get_feature_names(['one', 'two', - 'three', 'four', 'five']) - - assert_array_equal(['one_Female', 'one_Male', - 'two_1', 'two_41', 'two_51', 'two_91', - 'three_boy', 'three_girl', - 'four_1', 'four_2', 'four_12', 'four_21', - 'five_3', 'five_10', 'five_30'], feature_names2) - - with pytest.raises(ValueError, match="input_features should have length"): - enc.get_feature_names(['one', 'two']) - - -def test_one_hot_encoder_feature_names_unicode(): - enc = OneHotEncoder() - X = np.array([['c❤t1', 'dat2']], dtype=object).T - enc.fit(X) - feature_names = enc.get_feature_names() - assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names) - feature_names = enc.get_feature_names(input_features=['n👍me']) - assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names) - - def test_one_hot_encoder_set_params(): X = np.array([[1, 2]]).T oh = OneHotEncoder() @@ -220,31 +387,6 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) -@pytest.mark.parametrize("X", [ - [['def', 1, np.nan], ['abc', 2, np.nan]], - np.array([[10, 1, np.nan], [5, 2, np.nan]]), - np.array([['b', 'A', np.nan], ['a', 'B', np.nan]], dtype=object) - ], ids=['mixed', 'numeric', 'object']) -def test_one_hot_encoder_with_nan(X): - Xtr = check_categorical_onehot(np.array(X)[:, [0]]) - assert_allclose(Xtr, [[0, 1], [1, 0]]) - - Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) - assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) - - Xtr = OneHotEncoder(categories='auto').fit_transform(X) - assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) - -def test_ohe_handle_unknow_sparse_nan(): - # TODO - pass - - -def test_ohe_handle_unknow_nan(): - # TODO - pass - - @pytest.mark.parametrize('sparse_', [False, True]) @pytest.mark.parametrize('drop', [None, 'first']) def test_one_hot_encoder_inverse(sparse_, drop): @@ -469,14 +611,18 @@ def test_one_hot_encoder_feature_names_drop(drop, expected_names): @pytest.mark.parametrize("as_data_frame", [False, 
True], ids=['array', 'dataframe']) @pytest.mark.parametrize("handle_unknown", ['error', 'ignore']) -def test_one_hot_encoder_accept_nan(X, as_data_frame, handle_unknown): +def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): if as_data_frame: pd = pytest.importorskip('pandas') X = pd.DataFrame(X) ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown) - ohe.fit(X) - ohe.fit_transform(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit_transform(X) if as_data_frame: X_partial = X.iloc[:1, :] @@ -484,7 +630,9 @@ def test_one_hot_encoder_accept_nan(X, as_data_frame, handle_unknown): X_partial = X[:1, :] ohe.fit(X_partial) - ohe.transform(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.transform(X) @pytest.mark.parametrize("X", [ @@ -542,6 +690,24 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) +@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, + np.array([['a', np.nan]], dtype=object).T], + ids=['numeric', 'object']) +def test_ordinal_encoder_raise_missing(X): + ohe = OrdinalEncoder() + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit_transform(X) + + ohe.fit(X[:1, :]) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.transform(X) + + def test_ordinal_encoder_raise_categories_shape(): X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T @@ -591,7 +757,7 @@ def test_encoder_dtypes_pandas(): assert_array_equal(enc.transform(X).toarray(), exp) X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]}) - X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype] + X_type = [int, object, float] enc.fit(X) assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)]) assert_array_equal(enc.transform(X).toarray(), exp) @@ -622,23 +788,39 @@ def test_one_hot_encoder_drop_manual(): enc.inverse_transform(trans)) -@pytest.mark.parametrize( - "X_fit, params, err_msg", - [([["Male"], ["Female"]], {'drop': 'second'}, - "Wrong input for parameter `drop`"), - ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'}, - "`handle_unknown` must be 'error'"), - ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]], - {'drop': np.asarray('b', dtype=object)}, - "Wrong input for parameter `drop`"), - ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]], - {'drop': ['ghi', 3, 59]}, - "The following categories were supposed")] -) -def test_one_hot_encoder_invalid_params(X_fit, params, err_msg): - enc = OneHotEncoder(**params) - with pytest.raises(ValueError, match=err_msg): - enc.fit(X_fit) +def test_one_hot_encoder_invalid_params(): + enc = OneHotEncoder(drop='second') + assert_raises_regex( + ValueError, + "Wrong input for parameter `drop`.", + enc.fit, [["Male"], ["Female"]]) + + enc = OneHotEncoder(handle_unknown='ignore', drop='first') + assert_raises_regex( + ValueError, + "`handle_unknown` must be 'error'", + enc.fit, [["Male"], ["Female"]]) + + enc = OneHotEncoder(drop='first') + assert_raises_regex( + ValueError, + "The handling of integer data will change in version", + enc.fit, [[1], [2]]) + + enc = OneHotEncoder(drop='first', categories='auto') + assert_no_warnings(enc.fit_transform, [[1], [2]]) + + enc = OneHotEncoder(drop=np.asarray('b', dtype=object)) + assert_raises_regex( + ValueError, + "Wrong input for parameter `drop`.", + 
enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) + + enc = OneHotEncoder(drop=['ghi', 3, 59]) + assert_raises_regex( + ValueError, + "The following categories were supposed", + enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) @pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']]) @@ -672,8 +854,3 @@ def test_categories(density, drop): assert cat_list[drop_idx] == drop_cat assert isinstance(ohe_test.drop_idx_, np.ndarray) assert ohe_test.drop_idx_.dtype == np.int_ - - -@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) -def test_encoders_has_categorical_tags(Encoder): - assert 'categorical' in Encoder()._get_tags()['X_types'] diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index de5c4a7191a91..c141cd12b97fc 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -208,20 +208,30 @@ def test_label_encoder_negative_ints(): def test_label_encode_with_nan(): - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), allow_nan=True)) == 1 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), allow_nan=True)) == 1 - assert len(_encode(np.asarray([4, np.nan]), allow_nan=True)) == 2 + # encode all nan within one category + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), + allow_nan=True)) == 1 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), + allow_nan=True)) == 1 + assert len(_encode(np.asarray([4, np.nan, np.nan]), allow_nan=True)) == 2 + + # the encoded size corresponds to the values size assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), encode=True, allow_nan=True)[1]) == 2 assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), encode=True, allow_nan=True)[1]) == 2 - assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), - encode=True, allow_nan=True)[1]) == 4 + + encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan]), + encode=True, allow_nan=True)[1] + assert_array_equal(encoded, [0, 1, 2, 2, 2]) -def test_label_encode_with_mixed_type(): - assert len(_encode(np.asarray([4, 'm']))) == 2 +@pytest.mark.parametrize("values", + [np.asarray([np.nan, np.nan], dtype=float), + np.asarray([np.nan, np.nan], dtype=object)]) +def test_label_encode_raise_nan(values): + assert_raises(ValueError, _encode, values, allow_nan=False) @pytest.mark.parametrize("dtype", ['str', 'object']) @@ -623,7 +633,10 @@ def test_encode_util(values, expected): assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) -def test_encode_check_unknown(): +@pytest.mark.parametrize( + "allow_nan", + [True, False]) +def test_encode_check_unknown(allow_nan): # test for the check_unknown parameter of _encode() uniques = np.array([1, 2, 3]) values = np.array([1, 2, 3, 4]) @@ -631,14 +644,50 @@ def test_encode_check_unknown(): # Default is True, raise error with pytest.raises(ValueError, match='y contains previously unseen labels'): - _encode(values, uniques, encode=True, check_unknown=True) + _encode(values, uniques, encode=True, check_unknown=True, + allow_nan=allow_nan) # dont raise error if False - _encode(values, uniques, encode=True, check_unknown=False) + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) # parameter is ignored for object dtype uniques = np.array(['a', 'b', 'c'], dtype=object) values = np.array(['a', 'b', 'c', 'd'], dtype=object) with pytest.raises(ValueError, match='y contains previously unseen labels'): - _encode(values, uniques, encode=True, 
check_unknown=False) + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) + + +@pytest.mark.parametrize( + "uniques, values", + [(np.array([1, 2, 3]), + np.array([1, 2, 3, np.nan])), + (np.array([np.nan, 2, 3]), + np.array([np.nan, 2, 3, 4]))]) +def test_encode_check_unknown_nan_float(uniques, values): + # test for the check_unknown parameter of _encode() with nan present + + with pytest.raises(ValueError, + match='y contains previously unseen label'): + _encode(values, uniques, encode=True, check_unknown=True, + allow_nan=True) + + # dont raise error if False + _encode(values, uniques, encode=True, check_unknown=False, allow_nan=True) + + +@pytest.mark.parametrize( + "uniques, values", + [(np.array(['a', 'b', 'c'], dtype=object), + np.array(['a', 'b', 'c', np.nan], dtype=object)), + (np.array([np.nan, 'b', 'c'], dtype=object), + np.array([np.nan, 'b', 'c', 'd'], dtype=object))]) +def test_encode_check_unknown_nan_object(uniques, values): + # test for the check_unknown parameter of _encode() with nan present + # parameter check_unknown is ignored for object dtype + with pytest.raises(ValueError, + match='y contains previously unseen label'): + _encode(values, uniques, encode=True, check_unknown=True, + allow_nan=True) From 2053fb2bb73c95525874d90e192bcb4cbfccb386 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 11:08:39 +0200 Subject: [PATCH 04/16] iter (clean _encoders.py) --- sklearn/preprocessing/_encoders.py | 534 +++---------------- sklearn/preprocessing/label.py | 4 +- sklearn/preprocessing/tests/test_encoders.py | 375 ++++--------- 3 files changed, 173 insertions(+), 740 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c1acfbe799485..ac03659d3ef23 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,20 +2,14 @@ # Joris Van den Bossche # License: BSD 3 clause -import numbers -import warnings - import numpy as np from scipy import sparse -from .. import get_config as _get_config from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils import deprecated -from ..utils.fixes import _argmax, _object_dtype_isnan +from ..utils.fixes import _argmax from ..utils.validation import check_is_fitted -from .base import _transform_selected from .label import _encode, _encode_check_unknown @@ -25,11 +19,10 @@ ] -class _BaseEncoder(BaseEstimator, TransformerMixin): +class _BaseEncoder(TransformerMixin, BaseEstimator): """ Base class for encoders that includes the code to categorize and transform the input features. - """ def _check_X(self, X): @@ -42,7 +35,6 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. 
- """ if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation @@ -79,8 +71,8 @@ def _get_feature(self, X, feature_idx): def _fit(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) - if self._categories != 'auto': - if len(self._categories) != n_features: + if self.categories != 'auto': + if len(self.categories) != n_features: raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") @@ -88,10 +80,10 @@ def _fit(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self._categories == 'auto': + if self.categories == 'auto': cats = _encode(Xi) else: - cats = np.array(self._categories[i], dtype=Xi.dtype) + cats = np.array(self.categories[i], dtype=Xi.dtype) if Xi.dtype != object: if not np.all(np.sort(cats) == cats): raise ValueError("Unsorted categories are not " @@ -110,6 +102,14 @@ def _transform(self, X, handle_unknown='error'): X_int = np.zeros((n_samples, n_features), dtype=np.int) X_mask = np.ones((n_samples, n_features), dtype=np.bool) + if n_features != len(self.categories_): + raise ValueError( + "The number of features in X is different to the number of " + "features of the fitted data. The fitted data had {} features " + "and the X has {} features." + .format(len(self.categories_,), n_features) + ) + for i in range(n_features): Xi = X_list[i] diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], @@ -134,66 +134,58 @@ def _transform(self, X, handle_unknown='error'): Xi = Xi.copy() Xi[~valid_mask] = self.categories_[i][0] - _, encoded = _encode(Xi, self.categories_[i], encode=True) + # We use check_unknown=False, since _encode_check_unknown was + # already called above. + _, encoded = _encode(Xi, self.categories_[i], encode=True, + check_unknown=False) X_int[:, i] = encoded return X_int, X_mask + def _more_tags(self): + return {'X_types': ['categorical']} -class OneHotEncoder(_BaseEncoder): - """Encode categorical integer features as a one-hot numeric array. +class OneHotEncoder(_BaseEncoder): + """Encode categorical features as a one-hot numeric array. The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array. - + returns a sparse matrix or dense array (depending on the ``sparse`` + parameter) By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. - The OneHotEncoder previously assumed that the input features take on - values in the range [0, max(values)). This behaviour is deprecated. - This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. - Note: a one-hot encoding of y labels should use a LabelBinarizer instead. - Read more in the :ref:`User Guide `. - Parameters ---------- categories : 'auto' or a list of lists/arrays of values, default='auto'. Categories (unique values) per feature: - - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. 
The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of numeric values. - The used categories can be found in the ``categories_`` attribute. - drop : 'first' or a list/array of shape (n_features,), default=None. Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression. - - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. - sparse : boolean, default=True Will return sparse matrix if set True else will return an array. - dtype : number type, default=np.float Desired dtype of output. - handle_unknown : 'error' or 'ignore', default='error'. Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter @@ -201,35 +193,6 @@ class OneHotEncoder(_BaseEncoder): transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be denoted as None. - - n_values : 'auto', int or array of ints, default='auto' - Number of values per feature. - - - 'auto' : determine value range from training data. - - int : number of categorical values per feature. - Each feature value should be in ``range(n_values)`` - - array : ``n_values[i]`` is the number of categorical values in - ``X[:, i]``. Each feature value should be - in ``range(n_values[i])`` - - .. deprecated:: 0.20 - The `n_values` keyword was deprecated in version 0.20 and will - be removed in 0.22. Use `categories` instead. - - categorical_features : 'all' or array of indices or mask, default='all' - Specify what features are treated as categorical. - - - 'all': All features are treated as categorical. - - array of indices: Array of categorical feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-categorical features are always stacked to the right of the matrix. - - .. deprecated:: 0.20 - The `categorical_features` keyword was deprecated in version - 0.20 and will be removed in 0.22. - You can use the ``ColumnTransformer`` instead. - Attributes ---------- categories_ : list of arrays @@ -237,52 +200,19 @@ class OneHotEncoder(_BaseEncoder): (in order of the features in X and corresponding with the output of ``transform``). This includes the category specified in ``drop`` (if any). - drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to + ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. - - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - .. deprecated:: 0.20 - The ``active_features_`` attribute was deprecated in version - 0.20 and will be removed in 0.22. - - feature_indices_ : array of shape (n_features,) - Indices to feature ranges. - Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by ``active_features_`` afterwards) - - .. 
deprecated:: 0.20 - The ``feature_indices_`` attribute was deprecated in version - 0.20 and will be removed in 0.22. - - n_values_ : array of shape (n_features,) - Maximum number of values per feature. - - .. deprecated:: 0.20 - The ``n_values_`` attribute was deprecated in version - 0.20 and will be removed in 0.22. - Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. - >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - ... # doctest: +ELLIPSIS - ... # doctest: +NORMALIZE_WHITESPACE - OneHotEncoder(categorical_features=None, categories=None, drop=None, - dtype=<... 'numpy.float64'>, handle_unknown='ignore', - n_values=None, sparse=True) - + OneHotEncoder(handle_unknown='ignore') >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() @@ -299,7 +229,6 @@ class OneHotEncoder(_BaseEncoder): >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() array([[0., 0., 0.], [1., 1., 0.]]) - See also -------- sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) @@ -315,184 +244,27 @@ class OneHotEncoder(_BaseEncoder): matrix indicating the presence of a class label. """ - def __init__(self, n_values=None, categorical_features=None, - categories=None, drop=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + def __init__(self, categories='auto', drop=None, sparse=True, + dtype=np.float64, handle_unknown='error'): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown - self.n_values = n_values - self.categorical_features = categorical_features self.drop = drop - # Deprecated attributes - - @deprecated("The ``active_features_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - @property - def active_features_(self): - check_is_fitted(self, 'categories_') - return self._active_features_ - - @deprecated("The ``feature_indices_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - @property - def feature_indices_(self): - check_is_fitted(self, 'categories_') - return self._feature_indices_ - - @deprecated("The ``n_values_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - @property - def n_values_(self): - check_is_fitted(self, 'categories_') - return self._n_values_ - - def _handle_deprecations(self, X): - # internal version of the attributes to handle deprecations - self._n_values = self.n_values - self._categories = getattr(self, '_categories', None) - self._categorical_features = getattr(self, '_categorical_features', - None) - - # user manually set the categories or second fit -> never legacy mode - if self.categories is not None or self._categories is not None: - self._legacy_mode = False - if self.categories is not None: - self._categories = self.categories - - # categories not set -> infer if we need legacy mode or not - elif self.n_values is not None and self.n_values != 'auto': - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and will be " - "removed in 0.22. You can use the 'categories' keyword " - "instead. 'n_values=n' corresponds to " - "'categories=[range(n)] * n_features'." 
- ) - warnings.warn(msg, DeprecationWarning) - self._legacy_mode = True - - else: # n_values = 'auto' - # n_values can also be None (default to catch usage), so set - # _n_values to 'auto' explicitly - self._n_values = 'auto' - if self.handle_unknown == 'ignore': - # no change in behaviour, no need to raise deprecation warning - self._legacy_mode = False - self._categories = 'auto' - if self.n_values == 'auto': - # user manually specified this - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and " - "will be removed in 0.22. n_values='auto' can be " - "replaced with categories='auto'." - ) - warnings.warn(msg, DeprecationWarning) - else: - # check if we have integer or categorical input - try: - check_array(X, dtype=np.int) - except ValueError: - self._legacy_mode = False - self._categories = 'auto' - else: - if self.drop is None: - msg = ( - "The handling of integer data will change in " - "version 0.22. Currently, the categories are " - "determined based on the range " - "[0, max(values)], while in the future they " - "will be determined based on the unique " - "values.\nIf you want the future behaviour " - "and silence this warning, you can specify " - "\"categories='auto'\".\n" - "In case you used a LabelEncoder before this " - "OneHotEncoder to convert the categories to " - "integers, then you can now use the " - "OneHotEncoder directly." - ) - warnings.warn(msg, FutureWarning) - self._legacy_mode = True - else: - msg = ( - "The handling of integer data will change in " - "version 0.22. Currently, the categories are " - "determined based on the range " - "[0, max(values)], while in the future they " - "will be determined based on the unique " - "values.\n The old behavior is not compatible " - "with the `drop` parameter. Instead, you " - "must manually specify \"categories='auto'\" " - "if you wish to use the `drop` parameter on " - "an array of entirely integer data. This will " - "enable the future behavior." - ) - raise ValueError(msg) - - # if user specified categorical_features -> always use legacy mode - if self.categorical_features is not None: - if (isinstance(self.categorical_features, str) - and self.categorical_features == 'all'): - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. The passed " - "value of 'all' is the default and can simply be removed.", - DeprecationWarning) - else: - if self.categories is not None: - raise ValueError( - "The 'categorical_features' keyword is deprecated, " - "and cannot be used together with specifying " - "'categories'.") - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. 
You can " - "use the ColumnTransformer instead.", DeprecationWarning) - # Set categories_ to empty list if no categorical columns exist - n_features = X.shape[1] - sel = np.zeros(n_features, dtype=bool) - sel[np.asarray(self.categorical_features)] = True - if sum(sel) == 0: - self.categories_ = [] - self._legacy_mode = True - self._categorical_features = self.categorical_features - else: - self._categorical_features = 'all' - - # Prevents new drop functionality from being used in legacy mode - if self._legacy_mode and self.drop is not None: + def _validate_keywords(self): + if self.handle_unknown not in ('error', 'ignore'): + msg = ("handle_unknown should be either 'error' or 'ignore', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + # If we have both dropped columns and ignored unknown + # values, there will be ambiguous cells. This creates difficulties + # in interpreting the model. + if self.drop is not None and self.handle_unknown != 'error': raise ValueError( - "The `categorical_features` and `n_values` keywords " - "are deprecated, and cannot be used together " - "with 'drop'.") - - def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to determine the categories of each feature. - - Returns - ------- - self - """ - - self._validate_keywords() - - self._handle_deprecations(X) - - if self._legacy_mode: - _transform_selected(X, self._legacy_fit_transform, self.dtype, - self._categorical_features, - copy=True) - return self - else: - self._fit(X, handle_unknown=self.handle_unknown) - self.drop_idx_ = self._compute_drop_idx() - return self + "`handle_unknown` must be 'error' when the drop parameter is " + "specified, as both would create categories that are all " + "zero.") def _compute_drop_idx(self): if self.drop is None: @@ -530,152 +302,48 @@ def _compute_drop_idx(self): "'first', None or array of objects, got {}") raise ValueError(msg.format(type(self.drop))) - def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - # If we have both dropped columns and ignored unknown - # values, there will be ambiguous cells. This creates difficulties - # in interpreting the model. - if self.drop is not None and self.handle_unknown != 'error': - raise ValueError( - "`handle_unknown` must be 'error' when the drop parameter is " - "specified, as both would create categories that are all " - "zero.") - - def _legacy_fit_transform(self, X): - """Assumes X contains only categorical features.""" - dtype = getattr(X, 'dtype', None) - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("OneHotEncoder in legacy mode cannot handle " - "categories encoded as negative integers. 
" - "Please set categories='auto' explicitly to " - "be able to use arbitrary integer values as " - "category identifiers.") - n_samples, n_features = X.shape - if (isinstance(self._n_values, str) and - self._n_values == 'auto'): - n_values = np.max(X, axis=0) + 1 - elif isinstance(self._n_values, numbers.Integral): - if (np.max(X, axis=0) >= self._n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self._n_values) - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self._n_values) - else: - try: - n_values = np.asarray(self._n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(self._n_values)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - - self._n_values_ = n_values - self.categories_ = [np.arange(n_val - 1, dtype=dtype) - for n_val in n_values] - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self._feature_indices_ = indices - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - - if (isinstance(self._n_values, str) and - self._n_values == 'auto'): - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self._active_features_ = active_features - - self.categories_ = [ - np.unique(X[:, i]).astype(dtype) if dtype - else np.unique(X[:, i]) for i in range(n_features)] - - return out if self.sparse else out.toarray() + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to determine the categories of each feature. + Returns + ------- + self + """ + self._validate_keywords() + self._fit(X, handle_unknown=self.handle_unknown) + self.drop_idx_ = self._compute_drop_idx() + return self def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X) but more convenient. - Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. - Returns ------- X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ - self._validate_keywords() + return super().fit_transform(X, y) - self._handle_deprecations(X) - - if self._legacy_mode: - return _transform_selected( - X, self._legacy_fit_transform, self.dtype, - self._categorical_features, copy=True) - else: - return self.fit(X).transform(X) - - def _legacy_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("OneHotEncoder in legacy mode cannot handle " - "categories encoded as negative integers. " - "Please set categories='auto' explicitly to " - "be able to use arbitrary integer values as " - "category identifiers.") - n_samples, n_features = X.shape - - indices = self._feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) - - # We use only those categorical features of X that are known using fit. - # i.e lesser than n_values_ using mask. 
- # This means, if self.handle_unknown is "ignore", the row_indices and - # col_indices corresponding to the unknown categorical feature are - # ignored. - mask = (X < self._n_values_).ravel() - if np.any(~mask): - if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either error or " - "unknown got %s" % self.handle_unknown) - if self.handle_unknown == 'error': - raise ValueError("unknown categorical feature present %s " - "during transform." % X.ravel()[~mask]) - - column_indices = (X + indices[:-1]).ravel()[mask] - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features)[mask] - data = np.ones(np.sum(mask)) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - if (isinstance(self._n_values, str) and - self._n_values == 'auto'): - out = out[:, self._active_features_] - - return out if self.sparse else out.toarray() - - def _transform_new(self, X): - """New implementation assuming categorical input""" + def transform(self, X): + """Transform X using one-hot encoding. + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to encode. + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array + Transformed input. + """ + check_is_fitted(self) # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) @@ -710,48 +378,20 @@ def _transform_new(self, X): else: return out - def transform(self, X): - """Transform X using one-hot encoding. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ - check_is_fitted(self, 'categories_') - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, - self._categorical_features, - copy=True) - else: - return self._transform_new(X) - def inverse_transform(self, X): """Convert the back data to the original representation. - In case unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. - Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. - Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. - """ - # if self._legacy_mode: - # raise ValueError("only supported for categorical features") - - check_is_fitted(self, 'categories_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -822,19 +462,16 @@ def inverse_transform(self, X): def get_feature_names(self, input_features=None): """Return feature names for output features. - Parameters ---------- input_features : list of string, length n_features, optional String names for input features if available. By default, "x0", "x1", ... "xn_features" is used. - Returns ------- output_feature_names : array of string, length n_output_features - """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] @@ -857,57 +494,45 @@ def get_feature_names(self, input_features=None): class OrdinalEncoder(_BaseEncoder): """Encode categorical features as an integer array. 
- The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers. This results in a single column of integers (0 to n_categories - 1) per feature. - Read more in the :ref:`User Guide `. - Parameters ---------- categories : 'auto' or a list of lists/arrays of values. Categories (unique values) per feature: - - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. - The used categories can be found in the ``categories_`` attribute. - dtype : number type, default np.float64 Desired dtype of output. - Attributes ---------- categories_ : list of arrays The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of ``transform``). - Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to an ordinal encoding. - >>> from sklearn.preprocessing import OrdinalEncoder >>> enc = OrdinalEncoder() >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - ... # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + OrdinalEncoder() >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) array([[0., 2.], [1., 0.]]) - >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) - See also -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of @@ -922,56 +547,44 @@ def __init__(self, categories='auto', dtype=np.float64): def fit(self, X, y=None): """Fit the OrdinalEncoder to X. - Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. - Returns ------- self - """ - # base classes uses _categories to deal with deprecations in - # OneHoteEncoder: can be removed once deprecations are removed - self._categories = self.categories self._fit(X) return self def transform(self, X): """Transform X to ordinal codes. - Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. - Returns ------- X_out : sparse matrix or a 2-d array Transformed input. - """ X_int, _ = self._transform(X) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): """Convert the data back to the original representation. - Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. - Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. 
- """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -991,7 +604,4 @@ def inverse_transform(self, X): labels = X[:, i].astype('int64', copy=False) X_tr[:, i] = self.categories_[i][labels] - return X_tr - - def _more_tags(self): - return {'X_types': ['categorical']} + return X_tr \ No newline at end of file diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 3adf40cc519b2..5e8d39b25cdb7 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -205,7 +205,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): diff = np.array(list(set(values) - uniques_set)) # set([np.nan]) - set([np.nan]) returns set() # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} - if diff and any(_get_mask(diff, np.nan)): + if len(diff) and any(_get_mask(diff, np.nan)): if not allow_nan: raise ValueError('Nan found during check_unknown') else: @@ -213,7 +213,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): any(_get_mask(set(values), np.nan)): diff = diff[~_get_mask(diff, np.nan)] if return_mask: - if diff: + if len(diff): valid_mask = np.array([val in uniques_set for val in values]) else: valid_mask = np.ones(len(values), dtype=bool) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 29cd6602e4f10..4804bd03ed6b8 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -8,14 +8,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import assert_no_warnings from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder @@ -27,228 +20,37 @@ def toarray(a): return a -def test_one_hot_encoder_sparse(): - # Test OneHotEncoder's fit and transform. 
- X = [[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder() - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - # discover max values automatically - X_trans = enc.fit_transform(X).toarray() - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - - # check outcome - assert_array_equal(X_trans, - [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - - # max value given as 3 - # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=4) - enc = OneHotEncoder(n_values=4) - with ignore_warnings(category=DeprecationWarning): - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 4 * 3)) - assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) - - # max value given per feature - # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=[3, 2, 2]) - enc = OneHotEncoder(n_values=[3, 2, 2]) - with ignore_warnings(category=DeprecationWarning): - X = [[1, 0, 1], [0, 1, 1]] - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 3 + 2 + 2)) - assert_array_equal(enc.n_values_, [3, 2, 2]) - # check that testing with larger feature works: - X = np.array([[2, 0, 1], [0, 1, 1]]) - enc.transform(X) - - # test that an error is raised when out of bounds: - X_too_large = [[0, 2, 1], [0, 1, 1]] - assert_raises(ValueError, enc.transform, X_too_large) - error_msg = r"unknown categorical feature present \[2\] during transform" - assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) - with ignore_warnings(category=DeprecationWarning): - assert_raises( - ValueError, - OneHotEncoder(n_values=2).fit_transform, X) - - # test that error is raised when wrong number of features - assert_raises(ValueError, enc.transform, X[:, :-1]) - - # test that error is raised when wrong number of features in fit - # with prespecified n_values - with ignore_warnings(category=DeprecationWarning): - assert_raises(ValueError, enc.fit, X[:, :-1]) - # test exception on wrong init param - with ignore_warnings(category=DeprecationWarning): - assert_raises( - TypeError, OneHotEncoder(n_values=np.int).fit, X) +def test_one_hot_encoder_sparse_dense(): + # check that sparse and dense will give the same results - enc = OneHotEncoder() - # test negative input to fit - with ignore_warnings(category=FutureWarning): - assert_raises(ValueError, enc.fit, [[0], [-1]]) - - # test negative input to transform - with ignore_warnings(category=FutureWarning): - enc.fit([[0], [1]]) - assert_raises(ValueError, enc.transform, [[0], [-1]]) - - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - enc = OneHotEncoder(drop='first', n_values=1) - for method in (enc.fit, enc.fit_transform): - assert_raises_regex( - ValueError, - 'The `categorical_features` and `n_values` keywords ', - method, [[0], [-1]]) - - enc = OneHotEncoder(drop='first', categorical_features='all') - assert_raises_regex( - ValueError, - 'The `categorical_features` and `n_values` keywords ', - method, [[0], [-1]]) - - -def test_one_hot_encoder_dense(): - # check for sparse=False - X = [[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder(sparse=False) - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - # discover max values automatically - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + X = np.array([[3, 2, 1], [0, 1, 
1]]) + enc_sparse = OneHotEncoder() + enc_dense = OneHotEncoder(sparse=False) - # check outcome - assert_array_equal(X_trans, - np.array([[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]])) - - -def test_one_hot_encoder_deprecationwarnings(): - for X in [[[3, 2, 1], [0, 1, 1]], - [[3., 2., 1.], [0., 1., 1.]]]: - enc = OneHotEncoder() - assert_warns_message(FutureWarning, "handling of integer", - enc.fit, X) - enc = OneHotEncoder() - assert_warns_message(FutureWarning, "handling of integer", - enc.fit_transform, X) - - # check it still works correctly as well - with ignore_warnings(category=FutureWarning): - X_trans = enc.fit_transform(X).toarray() - res = [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]] - assert_array_equal(X_trans, res) - - # check deprecated attributes - assert_warns(DeprecationWarning, lambda: enc.active_features_) - assert_warns(DeprecationWarning, lambda: enc.feature_indices_) - assert_warns(DeprecationWarning, lambda: enc.n_values_) - - # check no warning is raised if keyword is specified - enc = OneHotEncoder(categories='auto') - assert_no_warnings(enc.fit, X) - enc = OneHotEncoder(categories='auto') - assert_no_warnings(enc.fit_transform, X) - X_trans = enc.fit_transform(X).toarray() - assert_array_equal(X_trans, res) + X_trans_sparse = enc_sparse.fit_transform(X) + X_trans_dense = enc_dense.fit_transform(X) - # check there is also a warning if the default is passed - enc = OneHotEncoder(n_values='auto', handle_unknown='ignore') - assert_warns(DeprecationWarning, enc.fit, X) + assert X_trans_sparse.shape == (2, 5) + assert X_trans_dense.shape == (2, 5) - X = np.array([['cat1', 'cat2']], dtype=object).T - enc = OneHotEncoder(categorical_features='all') - assert_warns(DeprecationWarning, enc.fit, X) + assert sparse.issparse(X_trans_sparse) + assert not sparse.issparse(X_trans_dense) + # check outcome + assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) -def test_one_hot_encoder_force_new_behaviour(): - # ambiguous integer case (non secutive range of categories) - X = np.array([[1, 2]]).T - X2 = np.array([[0, 1]]).T - # without argument -> by default using legacy behaviour with warnings +def test_one_hot_encoder_diff_n_features(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + X2 = np.array([[1, 0]]) enc = OneHotEncoder() - - with ignore_warnings(category=FutureWarning): - enc.fit(X) - - res = enc.transform(X2) - exp = np.array([[0, 0], [1, 0]]) - assert_array_equal(res.toarray(), exp) - - # with explicit auto argument -> don't use legacy behaviour - # (so will raise an error on unseen value within range) - enc = OneHotEncoder(categories='auto') enc.fit(X) - assert_raises(ValueError, enc.transform, X2) - - -def _run_one_hot(X, X2, cat): - # enc = assert_warns( - # DeprecationWarning, - # OneHotEncoder, categorical_features=cat) - enc = OneHotEncoder(categorical_features=cat) - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - Xtr = enc.fit_transform(X) - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - X2tr = enc.fit(X).transform(X2) - return Xtr, X2tr - - -def _check_one_hot(X, X2, cat, n_features): - ind = np.where(cat)[0] - # With mask - A, B = _run_one_hot(X, X2, cat) - # With indices - C, D = _run_one_hot(X, X2, ind) - # Check shape - assert_equal(A.shape, (2, n_features)) - assert_equal(B.shape, (1, n_features)) - assert_equal(C.shape, (2, n_features)) - assert_equal(D.shape, (1, n_features)) - # Check that mask and 
indices give the same results
-    assert_array_equal(toarray(A), toarray(C))
-    assert_array_equal(toarray(B), toarray(D))
-
-
-def test_one_hot_encoder_categorical_features():
-    X = np.array([[3, 2, 1], [0, 1, 1]])
-    X2 = np.array([[1, 1, 1]])
-
-    cat = [True, False, False]
-    _check_one_hot(X, X2, cat, 4)
-
-    # Edge case: all non-categorical
-    cat = [False, False, False]
-    _check_one_hot(X, X2, cat, 3)
-
-    # Edge case: all categorical
-    cat = [True, True, True]
-    _check_one_hot(X, X2, cat, 5)
-
-    # check error raised if also specifying categories
-    oh = OneHotEncoder(categories=[range(3)],
-                       categorical_features=[True, False, False])
-    assert_raises(ValueError, oh.fit, X)
-
-
-def test_one_hot_encoder_categorical_features_ignore_unknown():
-    # GH12881 bug in combination of categorical_features with ignore
-    X = np.array([[1, 2, 3], [4, 5, 6], [2, 3, 2]]).T
-    oh = OneHotEncoder(categorical_features=[2], handle_unknown='ignore')
-
-    with ignore_warnings(category=DeprecationWarning):
-        res = oh.fit_transform(X)
-
-    expected = np.array([[1, 0, 1], [0, 1, 0], [1, 2, 3], [4, 5, 6]]).T
-    assert_array_equal(res.toarray(), expected)


 def test_one_hot_encoder_handle_unknown():
@@ -258,8 +60,9 @@ def test_one_hot_encoder_handle_unknown():
     # Test that one hot encoder raises error for unknown features
     # present during transform.
     oh = OneHotEncoder(handle_unknown='error')
-    assert_warns(FutureWarning, oh.fit, X)
-    assert_raises(ValueError, oh.transform, X2)
+    oh.fit(X)
+    with pytest.raises(ValueError, match='Found unknown categories'):
+        oh.transform(X2)

     # Test the ignore option, ignores unknown features (giving all 0's)
     oh = OneHotEncoder(handle_unknown='ignore')
@@ -273,7 +76,8 @@ def test_one_hot_encoder_handle_unknown():
     # Raise error if handle_unknown is neither ignore or error.
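     # A minimal sketch of the error path restored above (a hypothetical
     # example assuming the API used in this file, not code from the
     # patch): the default handle_unknown='error' rejects unseen
     # categories at transform time.
     #
     #     enc = OneHotEncoder(categories='auto').fit([['a'], ['b']])
     #     enc.transform([['c']])   # ValueError: Found unknown categories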
     oh = OneHotEncoder(handle_unknown='42')
-    assert_raises(ValueError, oh.fit, X)
+    with pytest.raises(ValueError, match='handle_unknown should be either'):
+        oh.fit(X)


 def test_one_hot_encoder_not_fitted():
@@ -285,19 +89,6 @@ def test_one_hot_encoder_not_fitted():
         enc.transform(X)


-def test_one_hot_encoder_no_categorical_features():
-    X = np.array([[3, 2, 1], [0, 1, 1]], dtype='float64')
-
-    cat = [False, False, False]
-    enc = OneHotEncoder(categorical_features=cat)
-    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
-        X_tr = enc.fit_transform(X)
-    expected_features = np.array([], dtype='object')
-    assert_array_equal(X, X_tr)
-    assert_array_equal(enc.get_feature_names(), expected_features)
-    assert enc.categories_ == []
-
-
 def test_one_hot_encoder_handle_unknown_strings():
     X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
     X2 = np.array(['55555', '22']).reshape((-1, 1))
@@ -346,6 +137,47 @@ def test_one_hot_encoder_dtype_pandas(output_dtype):
     assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)


+def test_one_hot_encoder_feature_names():
+    enc = OneHotEncoder()
+    X = [['Male', 1, 'girl', 2, 3],
+         ['Female', 41, 'girl', 1, 10],
+         ['Male', 51, 'boy', 12, 3],
+         ['Male', 91, 'girl', 21, 30]]
+
+    enc.fit(X)
+    feature_names = enc.get_feature_names()
+    assert isinstance(feature_names, np.ndarray)
+
+    assert_array_equal(['x0_Female', 'x0_Male',
+                        'x1_1', 'x1_41', 'x1_51', 'x1_91',
+                        'x2_boy', 'x2_girl',
+                        'x3_1', 'x3_2', 'x3_12', 'x3_21',
+                        'x4_3',
+                        'x4_10', 'x4_30'], feature_names)
+
+    feature_names2 = enc.get_feature_names(['one', 'two',
+                                            'three', 'four', 'five'])
+
+    assert_array_equal(['one_Female', 'one_Male',
+                        'two_1', 'two_41', 'two_51', 'two_91',
+                        'three_boy', 'three_girl',
+                        'four_1', 'four_2', 'four_12', 'four_21',
+                        'five_3', 'five_10', 'five_30'], feature_names2)
+
+    with pytest.raises(ValueError, match="input_features should have length"):
+        enc.get_feature_names(['one', 'two'])
+
+
+def test_one_hot_encoder_feature_names_unicode():
+    enc = OneHotEncoder()
+    X = np.array([['c❤t1', 'dat2']], dtype=object).T
+    enc.fit(X)
+    feature_names = enc.get_feature_names()
+    assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names)
+    feature_names = enc.get_feature_names(input_features=['n👍me'])
+    assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)
+
+
 def test_one_hot_encoder_set_params():
     X = np.array([[1, 2]]).T
     oh = OneHotEncoder()
@@ -428,7 +260,8 @@ def test_one_hot_encoder_inverse(sparse_, drop):
     # incorrect shape raises
     X_tr = np.array([[0, 1, 1], [1, 0, 1]])
     msg = re.escape('Shape of the passed X data is not correct')
-    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_tr)


 @pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@@ -687,7 +520,8 @@ def test_ordinal_encoder_inverse():
     # incorrect shape raises
     X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
     msg = re.escape('Shape of the passed X data is not correct')
-    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_tr)


 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
@@ -718,6 +552,7 @@ def test_ordinal_encoder_raise_categories_shape():
     with pytest.raises(ValueError, match=msg):
         enc.fit(X)

+
 def test_encoder_dtypes():
     # check that dtypes are preserved when determining categories
     enc = OneHotEncoder(categories='auto')
@@ -757,7 +592,7 @@ def test_encoder_dtypes_pandas():
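     # A small sketch of the dtype bookkeeping this hunk restores (an
     # editorial example assuming pandas is available, not code from the
     # patch): each fitted category array keeps its DataFrame column's
     # own dtype rather than the builtin Python types.
     #
     #     import pandas as pd
     #     X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
     #     [X[c].dtype for c in X]
     #     # -> [dtype('int64'), dtype('O'), dtype('float64')]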
     assert_array_equal(enc.transform(X).toarray(), exp)

     X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
-    X_type = [int, object, float]
+    X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
     enc.fit(X)
     assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
     assert_array_equal(enc.transform(X).toarray(), exp)
@@ -788,48 +623,31 @@ def test_one_hot_encoder_drop_manual():
         enc.inverse_transform(trans))


-def test_one_hot_encoder_invalid_params():
-    enc = OneHotEncoder(drop='second')
-    assert_raises_regex(
-        ValueError,
-        "Wrong input for parameter `drop`.",
-        enc.fit, [["Male"], ["Female"]])
-
-    enc = OneHotEncoder(handle_unknown='ignore', drop='first')
-    assert_raises_regex(
-        ValueError,
-        "`handle_unknown` must be 'error'",
-        enc.fit, [["Male"], ["Female"]])
-
-    enc = OneHotEncoder(drop='first')
-    assert_raises_regex(
-        ValueError,
-        "The handling of integer data will change in version",
-        enc.fit, [[1], [2]])
-
-    enc = OneHotEncoder(drop='first', categories='auto')
-    assert_no_warnings(enc.fit_transform, [[1], [2]])
-
-    enc = OneHotEncoder(drop=np.asarray('b', dtype=object))
-    assert_raises_regex(
-        ValueError,
-        "Wrong input for parameter `drop`.",
-        enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
-
-    enc = OneHotEncoder(drop=['ghi', 3, 59])
-    assert_raises_regex(
-        ValueError,
-        "The following categories were supposed",
-        enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
+@pytest.mark.parametrize(
+    "X_fit, params, err_msg",
+    [([["Male"], ["Female"]], {'drop': 'second'},
+      "Wrong input for parameter `drop`"),
+     ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
+      "`handle_unknown` must be 'error'"),
+     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
+      {'drop': np.asarray('b', dtype=object)},
+      "Wrong input for parameter `drop`"),
+     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
+      {'drop': ['ghi', 3, 59]},
+      "The following categories were supposed")]
+)
+def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
+    enc = OneHotEncoder(**params)
+    with pytest.raises(ValueError, match=err_msg):
+        enc.fit(X_fit)


 @pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
 def test_invalid_drop_length(drop):
     enc = OneHotEncoder(drop=drop)
-    assert_raises_regex(
-        ValueError,
-        "`drop` should have length equal to the number",
-        enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
+    err_msg = "`drop` should have length equal to the number"
+    with pytest.raises(ValueError, match=err_msg):
+        enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])


 @pytest.mark.parametrize("density", [True, False],
                          ids=['sparse', 'dense'])
@@ -854,3 +672,8 @@ def test_categories(density, drop):
     assert cat_list[drop_idx] == drop_cat
     assert isinstance(ohe_test.drop_idx_, np.ndarray)
     assert ohe_test.drop_idx_.dtype == np.int_
+
+
+@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
+def test_encoders_has_categorical_tags(Encoder):
+    assert 'categorical' in Encoder()._get_tags()['X_types']
\ No newline at end of file

From 2d71efa02606725aeb955799a1a7469df3e1a587 Mon Sep 17 00:00:00 2001
From: twsthomas
Date: Wed, 18 Sep 2019 11:17:17 +0200
Subject: [PATCH 05/16] clean _encoders and test_encoders

---
 sklearn/preprocessing/_encoders.py           | 61 +++++++++++++++++++-
 sklearn/preprocessing/tests/test_encoders.py | 17 +++---
 2 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index ac03659d3ef23..c33744204fc36 100644
--- a/sklearn/preprocessing/_encoders.py
+++
b/sklearn/preprocessing/_encoders.py @@ -19,10 +19,11 @@ ] -class _BaseEncoder(TransformerMixin, BaseEstimator): +class _BaseEncoder(BaseEstimator, TransformerMixin): """ Base class for encoders that includes the code to categorize and transform the input features. + """ def _check_X(self, X): @@ -35,6 +36,7 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. + """ if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation @@ -148,44 +150,57 @@ def _more_tags(self): class OneHotEncoder(_BaseEncoder): """Encode categorical features as a one-hot numeric array. + The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the ``sparse`` parameter) + By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. + This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. + Note: a one-hot encoding of y labels should use a LabelBinarizer instead. + Read more in the :ref:`User Guide `. + Parameters ---------- categories : 'auto' or a list of lists/arrays of values, default='auto'. Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of numeric values. + The used categories can be found in the ``categories_`` attribute. + drop : 'first' or a list/array of shape (n_features,), default=None. Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression. + - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. + sparse : boolean, default=True Will return sparse matrix if set True else will return an array. + dtype : number type, default=np.float Desired dtype of output. + handle_unknown : 'error' or 'ignore', default='error'. Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter @@ -193,6 +208,7 @@ class OneHotEncoder(_BaseEncoder): transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be denoted as None. + Attributes ---------- categories_ : list of arrays @@ -200,19 +216,23 @@ class OneHotEncoder(_BaseEncoder): (in order of the features in X and corresponding with the output of ``transform``). This includes the category specified in ``drop`` (if any). 
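+
+        For instance (an illustrative sketch, not taken from the test
+        suite): after ``fit([['a'], ['b']])``, ``categories_`` is
+        ``[array(['a', 'b'], dtype=object)]``.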
+ drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to + ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. + Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. + >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() @@ -229,6 +249,7 @@ class OneHotEncoder(_BaseEncoder): >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() array([[0., 0., 0.], [1., 1., 0.]]) + See also -------- sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) @@ -304,10 +325,12 @@ def _compute_drop_idx(self): def fit(self, X, y=None): """Fit OneHotEncoder to X. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. + Returns ------- self @@ -319,11 +342,14 @@ def fit(self, X, y=None): def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. + Equivalent to fit(X).transform(X) but more convenient. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. + Returns ------- X_out : sparse matrix if sparse=True else a 2-d array @@ -334,10 +360,12 @@ def fit_transform(self, X, y=None): def transform(self, X): """Transform X using one-hot encoding. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. + Returns ------- X_out : sparse matrix if sparse=True else a 2-d array @@ -380,16 +408,20 @@ def transform(self, X): def inverse_transform(self, X): """Convert the back data to the original representation. + In case unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. + Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. + Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. + """ check_is_fitted(self) X = check_array(X, accept_sparse='csr') @@ -462,14 +494,17 @@ def inverse_transform(self, X): def get_feature_names(self, input_features=None): """Return feature names for output features. + Parameters ---------- input_features : list of string, length n_features, optional String names for input features if available. By default, "x0", "x1", ... "xn_features" is used. + Returns ------- output_feature_names : array of string, length n_output_features + """ check_is_fitted(self) cats = self.categories_ @@ -494,32 +529,41 @@ def get_feature_names(self, input_features=None): class OrdinalEncoder(_BaseEncoder): """Encode categorical features as an integer array. + The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers. This results in a single column of integers (0 to n_categories - 1) per feature. + Read more in the :ref:`User Guide `. + Parameters ---------- categories : 'auto' or a list of lists/arrays of values. 
Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. + The used categories can be found in the ``categories_`` attribute. + dtype : number type, default np.float64 Desired dtype of output. + Attributes ---------- categories_ : list of arrays The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of ``transform``). + Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to an ordinal encoding. + >>> from sklearn.preprocessing import OrdinalEncoder >>> enc = OrdinalEncoder() >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] @@ -530,9 +574,11 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.transform([['Female', 3], ['Male', 1]]) array([[0., 2.], [1., 0.]]) + >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) + See also -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of @@ -547,13 +593,16 @@ def __init__(self, categories='auto', dtype=np.float64): def fit(self, X, y=None): """Fit the OrdinalEncoder to X. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. + Returns ------- self + """ self._fit(X) @@ -561,28 +610,34 @@ def fit(self, X, y=None): def transform(self, X): """Transform X to ordinal codes. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. + Returns ------- X_out : sparse matrix or a 2-d array Transformed input. + """ X_int, _ = self._transform(X) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): """Convert the data back to the original representation. + Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. + Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. 
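+
+        For instance (an illustrative sketch): with ``categories_`` equal
+        to ``[array(['a', 'b'], dtype=object)]``, ``inverse_transform([[1.]])``
+        gives ``array([['b']], dtype=object)``.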
+ """ check_is_fitted(self) X = check_array(X, accept_sparse='csr') @@ -604,4 +659,4 @@ def inverse_transform(self, X): labels = X[:, i].astype('int64', copy=False) X_tr[:, i] = self.categories_[i][labels] - return X_tr \ No newline at end of file + return X_tr diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 4804bd03ed6b8..8e1a61781544a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -8,6 +8,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_allclose from sklearn.preprocessing import OneHotEncoder @@ -260,8 +261,7 @@ def test_one_hot_encoder_inverse(sparse_, drop): # incorrect shape raises X_tr = np.array([[0, 1, 1], [1, 0, 1]]) msg = re.escape('Shape of the passed X data is not correct') - with pytest.raises(ValueError, match=msg): - enc.inverse_transform(X_tr) + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @@ -520,8 +520,7 @@ def test_ordinal_encoder_inverse(): # incorrect shape raises X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) msg = re.escape('Shape of the passed X data is not correct') - with pytest.raises(ValueError, match=msg): - enc.inverse_transform(X_tr) + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, @@ -552,7 +551,6 @@ def test_ordinal_encoder_raise_categories_shape(): with pytest.raises(ValueError, match=msg): enc.fit(X) - def test_encoder_dtypes(): # check that dtypes are preserved when determining categories enc = OneHotEncoder(categories='auto') @@ -645,9 +643,10 @@ def test_one_hot_encoder_invalid_params(X_fit, params, err_msg): @pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']]) def test_invalid_drop_length(drop): enc = OneHotEncoder(drop=drop) - err_msg = "`drop` should have length equal to the number" - with pytest.raises(ValueError, match=err_msg): - enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) + assert_raises_regex( + ValueError, + "`drop` should have length equal to the number", + enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) @pytest.mark.parametrize("density", [True, False], @@ -676,4 +675,4 @@ def test_categories(density, drop): @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): - assert 'categorical' in Encoder()._get_tags()['X_types'] \ No newline at end of file + assert 'categorical' in Encoder()._get_tags()['X_types'] From 8a66e43ed39a910b118d71ad47606bbd9c7f22ac Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 11:27:48 +0200 Subject: [PATCH 06/16] clean --- .gitignore | 5 +++++ sklearn/preprocessing/label.py | 8 -------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 73feb51e76e2f..52163a5877104 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +*.ipynb +*(copy)* +*.code-workspace +*thomas* + *.pyc *.so *.pyd diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 5e8d39b25cdb7..52e7b5df4b122 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -35,14 +35,6 @@ ] -def get_encoding(uniques, values): - if np.diff(uniques) > 0: - return np.searchsorted(uniques, values) - else: - table = {val: i for i, val 
in enumerate(uniques)} - return np.array([table[v] for v in values]) - - def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): # only used in _encode below, see docstring there for details From c613940f8b86de64268a7922c8145d4725fe7222 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 11:29:33 +0200 Subject: [PATCH 07/16] clean --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 52163a5877104..73feb51e76e2f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,3 @@ -*.ipynb -*(copy)* -*.code-workspace -*thomas* - *.pyc *.so *.pyd From 894c0e5dc68f35aca162fc59e37b1a9b978788d3 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 15:04:07 +0200 Subject: [PATCH 08/16] iter --- sklearn/preprocessing/label.py | 50 ++++++++++++++--------- sklearn/preprocessing/tests/test_label.py | 4 +- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index fb3798ee44402..15df02a40986f 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -25,7 +25,7 @@ from ..utils.validation import _num_samples from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target -from ..impute._base import _get_mask +from ..utils.mask import _get_mask __all__ = [ 'label_binarize', @@ -45,7 +45,7 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, # np.nan is always sorted last if len(uniques) and is_scalar_nan(uniques[-1]): if not allow_nan: - raise ValueError('nan found in values and allow_nan=False') + raise ValueError('Values contains NaN and allow_nan=False') nan_idx = np.searchsorted(uniques, np.nan) uniques = uniques[:nan_idx+1] if encode: @@ -57,7 +57,7 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, # np.nan is always sorted last if len(uniques) and is_scalar_nan(uniques[-1]): if not allow_nan: - raise ValueError('nan found in values and allow_nan=False') + raise ValueError('Values contains NaN and allow_nan=False') nan_idx = np.searchsorted(uniques, np.nan) uniques = uniques[:nan_idx+1] return uniques @@ -79,9 +79,9 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): missing_mask = _get_mask(values, np.nan) if np.any(missing_mask): if not allow_nan: - raise ValueError('nan found in values and allow_nan=False') + raise ValueError('Values contains NaN and allow_nan=False') else: - # sorted([4, np.nan]) != np.sort([4, np.nan]) + # need np.sort to ensure nan is sorted last uniques = np.sort(list(set(values[~missing_mask]) | {np.nan})) else: uniques = sorted(set(values)) @@ -194,16 +194,20 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): """ if values.dtype == object: uniques_set = set(uniques) - diff = np.array(list(set(values) - uniques_set)) - # set([np.nan]) - set([np.nan]) returns set() - # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} - if len(diff) and any(_get_mask(diff, np.nan)): + values_set = set(values) + array_values_set = np.array(values_set) + is_nan_in_value = np.any(_object_dtype_isnan(array_values_set)) + if is_nan_in_value: if not allow_nan: - raise ValueError('Nan found during check_unknown') + raise ValueError('Values contains NaN') + elif any(_get_mask(uniques, np.nan)): + diff = np.array(array_values_set - uniques_set) + diff = diff[~_get_mask(diff, np.nan)] else: - if any(_get_mask(uniques_set, np.nan)) and\ - any(_get_mask(set(values), 
np.nan)): - diff = diff[~_get_mask(diff, np.nan)] + diff = list(values_set - uniques_set) + else: + diff = list(values_set - uniques_set) + if return_mask: if len(diff): valid_mask = np.array([val in uniques_set for val in values]) @@ -214,15 +218,21 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): return diff else: unique_values = np.unique(values) - diff = np.setdiff1d(unique_values, uniques, assume_unique=True) - # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] - if any(_get_mask(diff, np.nan)): + mask_nan_in_values = _get_mask(unique_values, np.nan) + if np.any(mask_nan_in_values): if not allow_nan: - raise ValueError('Nan found during check_unknown') + raise ValueError('Values conatins NaN') else: - if any(_get_mask(unique_values, np.nan)) and\ - any(_get_mask(uniques, np.nan)): - diff = [x for x in diff if not is_scalar_nan(x)] + mask_nan_in_uniques = _get_mask(uniques, np.nan) + if np.any(mask_nan_in_uniques): + diff = np.setdiff1d(unique_values[~mask_nan_in_values], + uniques[~mask_nan_in_uniques], + assume_unique=True) + else: + diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + else: + diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + if return_mask: if diff: valid_mask = np.in1d(values, uniques) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 52fe83e0a83c8..43169776a701d 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -242,7 +242,9 @@ def test_label_encode_with_nan(): [np.asarray([np.nan, np.nan], dtype=float), np.asarray([np.nan, np.nan], dtype=object)]) def test_label_encode_raise_nan(values): - assert_raises(ValueError, _encode, values, allow_nan=False) + msg = 'Values contains NaN' + with pytest.raises(ValueError, match=msg): + _encode(values, allow_nan=False) @pytest.mark.parametrize("dtype", ['str', 'object']) From 1a266b0f8cc0859e2dde5f8f693bf8dd8c197128 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 15:12:24 +0200 Subject: [PATCH 09/16] typo --- sklearn/preprocessing/label.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 15df02a40986f..bb41359713ac0 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -229,7 +229,8 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): uniques[~mask_nan_in_uniques], assume_unique=True) else: - diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + diff = np.setdiff1d(unique_values, uniques, + assume_unique=True) else: diff = np.setdiff1d(unique_values, uniques, assume_unique=True) From afad176b6a1b709df5bf73315122d99efed1ca29 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 17:04:09 +0200 Subject: [PATCH 10/16] add functions --- sklearn/preprocessing/label.py | 100 +++++++++++++--------- sklearn/preprocessing/tests/test_label.py | 31 +++++++ 2 files changed, 90 insertions(+), 41 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index bb41359713ac0..2294b6f894035 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -35,31 +35,43 @@ ] +def _nan_unique(ar, return_inverse=False, allow_nan=False): + # mimic np.unique with allow_nan option + + if return_inverse: + uniques, inverse = np.unique(ar, return_inverse=True) + else: + uniques = np.unique(ar) + + nan_idx = None + # np.nan is always sorted last + 
if len(uniques) and is_scalar_nan(uniques[-1]): + if not allow_nan: + raise ValueError('Values contains NaN and allow_nan=False') + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + + if return_inverse and nan_idx is not None: + inverse[inverse > nan_idx] = nan_idx + + if return_inverse: + return uniques, inverse + else: + return uniques + + def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: if encode: - uniques, encoded = np.unique(values, return_inverse=True) - # np.nan is always sorted last - if len(uniques) and is_scalar_nan(uniques[-1]): - if not allow_nan: - raise ValueError('Values contains NaN and allow_nan=False') - nan_idx = np.searchsorted(uniques, np.nan) - uniques = uniques[:nan_idx+1] - if encode: - encoded[encoded > nan_idx] = nan_idx + uniques, encoded = _nan_unique(values, return_inverse=True, + allow_nan=allow_nan) return uniques, encoded else: # unique sorts - uniques = np.unique(values) - # np.nan is always sorted last - if len(uniques) and is_scalar_nan(uniques[-1]): - if not allow_nan: - raise ValueError('Values contains NaN and allow_nan=False') - nan_idx = np.searchsorted(uniques, np.nan) - uniques = uniques[:nan_idx+1] + uniques = _nan_unique(values, allow_nan=allow_nan) return uniques if encode: if check_unknown: @@ -73,6 +85,25 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, return uniques +class TableWithNan(object): + #  hash table which allows nan as a key + + def __init__(self): + self.dict = dict() + self.nan_value = None + + def get(self, key): + if is_scalar_nan(key) and self.nan_value is not None: + return self.nan_value + return self.dict[key] + + def set(self, key, value): + if is_scalar_nan(key): + self.nan_value = value + else: + self.dict[key] = value + + def _encode_python(values, uniques=None, encode=False, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: @@ -87,29 +118,15 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) if encode: - table = dict() + # hash is not enough to identify nan + table = TableWithNan() for i, val in enumerate(uniques): - if is_scalar_nan(val): - # table[nan] always raise KeyError - nan_index = i - else: - table[val] = i + table.set(val, i) try: - encoded = [] - for val in values: - if is_scalar_nan(val): - encoded.append(nan_index) - else: - encoded.append(table[val]) - encoded = np.array(encoded) + encoded = np.array([table.get(val) for val in values]) except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) - except UnboundLocalError as e: - # 'nan_index' referenced before assignment - raise ValueError("y contains previously unseen label nan, " - "consider using allow_nan=True. %s" - % str(e)) return uniques, encoded else: return uniques @@ -143,7 +160,7 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, _BaseEncoder._transform() to avoid calling _encode_check_unknown() twice. allow_nan : bool, default False - if True, encode np.nan as another category. Otherwise raise an error + if True, encode `np.nan` as another category. 
Otherwise raise an error if nan are present Returns @@ -182,6 +199,8 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): return_mask : bool, default False If True, return a mask of the same shape as `values` indicating the valid values. + allow_nan : bool, default False + If False, raise an error if NaN are present. Returns ------- @@ -195,21 +214,20 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): if values.dtype == object: uniques_set = set(uniques) values_set = set(values) - array_values_set = np.array(values_set) - is_nan_in_value = np.any(_object_dtype_isnan(array_values_set)) + is_nan_in_value = any([is_scalar_nan(val) for val in values_set]) if is_nan_in_value: if not allow_nan: raise ValueError('Values contains NaN') elif any(_get_mask(uniques, np.nan)): - diff = np.array(array_values_set - uniques_set) - diff = diff[~_get_mask(diff, np.nan)] + diff = np.array(values_set - uniques_set) + diff = list(diff[~_get_mask(diff, np.nan)]) else: diff = list(values_set - uniques_set) else: diff = list(values_set - uniques_set) if return_mask: - if len(diff): + if diff: valid_mask = np.array([val in uniques_set for val in values]) else: valid_mask = np.ones(len(values), dtype=bool) @@ -221,7 +239,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): mask_nan_in_values = _get_mask(unique_values, np.nan) if np.any(mask_nan_in_values): if not allow_nan: - raise ValueError('Values conatins NaN') + raise ValueError('Values contains NaN') else: mask_nan_in_uniques = _get_mask(uniques, np.nan) if np.any(mask_nan_in_uniques): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 43169776a701d..7d2a24d729408 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -23,6 +23,7 @@ from sklearn.preprocessing.label import _inverse_binarize_thresholding from sklearn.preprocessing.label import _inverse_binarize_multiclass from sklearn.preprocessing.label import _encode +from sklearn.preprocessing.label import _encode_check_unknown from sklearn import datasets @@ -718,3 +719,33 @@ def test_encode_check_unknown_nan_object(uniques, values): match='y contains previously unseen label'): _encode(values, uniques, encode=True, check_unknown=True, allow_nan=True) + + +@pytest.mark.parametrize("return_mask", [True, False]) +@pytest.mark.parametrize( + "uniques, values", + [(np.array(['a', 'b', 'c'], dtype=object), + np.array(['a', 'b', 'c', np.nan], dtype=object)), + (np.array([np.nan, 'b', 'c'], dtype=object), + np.array([np.nan, 'b', 'c', 'd'], dtype=object)), + (np.array([1, 2, 3]), + np.array([1, 2, 3, np.nan])), + (np.array([np.nan, 2, 3]), + np.array([np.nan, 2, 3, 4]))]) +def test_check_unknown_nan_raise(uniques, values, return_mask): + # test for the check_unknown parameter of _encode() with nan present + + with pytest.raises(ValueError, + match='Values contains NaN'): + _encode_check_unknown(values, uniques, return_mask=return_mask, + allow_nan=False) + + +def test_nan_unique(): + # TODO + pass + + +def test_table_with_nan(): + # TODO + pass From db0513699346589ff8cd6cd7ec1f52a80b50befc Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 17:19:23 +0200 Subject: [PATCH 11/16] ad test --- sklearn/preprocessing/label.py | 1 - sklearn/preprocessing/tests/test_label.py | 67 +++++++++++------------ 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/sklearn/preprocessing/label.py 
b/sklearn/preprocessing/label.py index 2294b6f894035..764c6932fc404 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -19,7 +19,6 @@ from ..utils.sparsefuncs import min_max_axis from ..utils import column_or_1d from ..utils import is_scalar_nan -from ..utils.fixes import _object_dtype_isnan from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 7d2a24d729408..9bcebea018bcb 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -219,35 +219,6 @@ def test_label_encoder_negative_ints(): le.transform([0, 6]) -def test_label_encode_with_nan(): - - # encode all nan within one category - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), - allow_nan=True)) == 1 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), - allow_nan=True)) == 1 - assert len(_encode(np.asarray([4, np.nan, np.nan]), allow_nan=True)) == 2 - - # the encoded size corresponds to the values size - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), - encode=True, allow_nan=True)[1]) == 2 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), - encode=True, allow_nan=True)[1]) == 2 - - encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan]), - encode=True, allow_nan=True)[1] - assert_array_equal(encoded, [0, 1, 2, 2, 2]) - - -@pytest.mark.parametrize("values", - [np.asarray([np.nan, np.nan], dtype=float), - np.asarray([np.nan, np.nan], dtype=object)]) -def test_label_encode_raise_nan(values): - msg = 'Values contains NaN' - with pytest.raises(ValueError, match=msg): - _encode(values, allow_nan=False) - - @pytest.mark.parametrize("dtype", ['str', 'object']) def test_label_encoder_str_bad_shape(dtype): le = LabelEncoder() @@ -642,6 +613,7 @@ def test_inverse_binarize_multiclass(): assert_array_equal(got, np.array([1, 1, 0])) +@pytest.mark.parametrize("allow_nan", [True, False]) @pytest.mark.parametrize( "values, expected", [(np.array([2, 1, 3, 1, 3], dtype='int64'), @@ -651,19 +623,44 @@ def test_inverse_binarize_multiclass(): (np.array(['b', 'a', 'c', 'a', 'c']), np.array(['a', 'b', 'c']))], ids=['int64', 'object', 'str']) -def test_encode_util(values, expected): +def test_encode_util(values, expected, allow_nan): uniques = _encode(values) assert_array_equal(uniques, expected) - uniques, encoded = _encode(values, encode=True) + uniques, encoded = _encode(values, encode=True, allow_nan=allow_nan) assert_array_equal(uniques, expected) assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) - _, encoded = _encode(values, uniques, encode=True) + _, encoded = _encode(values, uniques, encode=True, allow_nan=allow_nan) assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) -@pytest.mark.parametrize( - "allow_nan", - [True, False]) +@pytest.mark.parametrize("dtype", [float, object]) +def test_label_encode_with_nan(dtype): + + # encode all nan within one category + assert len(_encode(np.asarray([np.nan, np.nan, float('nan')], dtype=dtype), + allow_nan=True)) == 1 + assert len(_encode(np.asarray([4, np.nan, float('nan')], dtype=dtype), + allow_nan=True)) == 2 + + # the encoded size corresponds to the values size + assert len(_encode(np.asarray([np.nan, np.nan], dtype=dtype), + encode=True, allow_nan=True)[1]) == 2 + + encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan], dtype=dtype), + encode=True, allow_nan=True)[1] + 
assert_array_equal(encoded, [0, 1, 2, 2, 2]) + + +@pytest.mark.parametrize("values", + [np.asarray([np.nan, np.nan], dtype=float), + np.asarray([np.nan, np.nan], dtype=object)]) +def test_label_encode_raise_nan(values): + msg = 'Values contains NaN' + with pytest.raises(ValueError, match=msg): + _encode(values, allow_nan=False) + + +@pytest.mark.parametrize("allow_nan", [True, False]) def test_encode_check_unknown(allow_nan): # test for the check_unknown parameter of _encode() uniques = np.array([1, 2, 3]) From b7284f69d874fc01158205caf4f2308b5508ea1b Mon Sep 17 00:00:00 2001 From: twsthomas Date: Thu, 19 Sep 2019 15:23:27 +0200 Subject: [PATCH 12/16] add more tests --- sklearn/preprocessing/label.py | 4 +- sklearn/preprocessing/tests/test_label.py | 106 ++++++++++++++++++++-- 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 764c6932fc404..a2282f3ed6d7e 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -84,7 +84,7 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, return uniques -class TableWithNan(object): +class _TableWithNan(object): #  hash table which allows nan as a key def __init__(self): @@ -118,7 +118,7 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): uniques = np.array(uniques, dtype=values.dtype) if encode: # hash is not enough to identify nan - table = TableWithNan() + table = _TableWithNan() for i, val in enumerate(uniques): table.set(val, i) try: diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 9bcebea018bcb..5e7ef6103d8d8 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -10,6 +10,7 @@ from scipy.sparse import lil_matrix from sklearn.utils.multiclass import type_of_target +from sklearn.utils import is_scalar_nan from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_warns_message @@ -22,8 +23,10 @@ from sklearn.preprocessing.label import _inverse_binarize_thresholding from sklearn.preprocessing.label import _inverse_binarize_multiclass -from sklearn.preprocessing.label import _encode +from sklearn.preprocessing.label import _encode, _encode_numpy, _encode_python from sklearn.preprocessing.label import _encode_check_unknown +from sklearn.preprocessing.label import _nan_unique +from sklearn.preprocessing.label import _TableWithNan from sklearn import datasets @@ -738,11 +741,102 @@ def test_check_unknown_nan_raise(uniques, values, return_mask): allow_nan=False) -def test_nan_unique(): - # TODO - pass +@pytest.mark.parametrize( + ["values", "unique", "inverse"], + [(np.array([]), [], []), + (np.array(['a', 'a', 'a'], dtype=object), ['a'], [0, 0, 0]), + (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [0, 2, 1]), + (np.array(['a', 'b', 'c', 'a', 'b'], dtype=object), ['a', 'b', 'c'], + [0, 1, 2, 0, 1]), + (np.array([1, 2, 3]), [1, 2, 3], [0, 1, 2]), + (np.array([1, 1, 1]), [1], [0, 0, 0]), + (np.array([1, 2, 3, 3, 2, 1]), [1, 2, 3], [0, 1, 2, 2, 1, 0]), + ]) +def test_nan_unique_same_as_np(values, unique, inverse): + #  assert _nan_unique == np.unique + + assert_array_equal(unique, _nan_unique(values)) + assert_array_equal(unique, np.unique(values)) + + u, i = _nan_unique(values, return_inverse=True) + assert_array_equal(unique, u) + assert_array_equal(inverse, i) + u, i = np.unique(values, return_inverse=True) + assert_array_equal(unique, u) + 
assert_array_equal(inverse, i) + + +@pytest.mark.parametrize( + ["values", "unique", "inverse"], + [(np.array([]), [], []), + (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]), + # (np.array([np.nan, 'a', 'a'], dtype=object), + # ['a', np.nan], [1, 0, 0]), + # (np.array([np.nan, 'c', 'b'], dtype=object), + # ['b', 'c', np.nan], [0, 2, 1]), + # (np.array([np.nan, 'b', 'c', 'a', 'b'], dtype=object), + # ['a', 'b', 'c', np.nan], [3, 1, 2, 0, 1]), + (np.array([np.nan, 2, 3]), [2, 3, np.nan], [2, 0, 1]), + (np.array([np.nan, 1, 1]), [1, np.nan], [1, 0, 0]), + (np.array([np.nan, 2, 3, 3, 2, 1]), [1, 2, 3, np.nan], + [3, 1, 2, 2, 1, 0]), + ]) +def test_nan_unique_nan(values, unique, inverse): + nan_unique, nan_inverse = _nan_unique(values, return_inverse=True, + allow_nan=True) + for nu, u in zip(nan_unique, unique): + if is_scalar_nan(nu): + assert is_scalar_nan(u) + else: + assert nu == u + for ni, i in zip(nan_inverse, inverse): + if is_scalar_nan(ni): + assert is_scalar_nan(i) + else: + assert ni == i + + +@pytest.mark.parametrize('encode_type', [_encode_numpy, _encode_python]) +@pytest.mark.parametrize( + ["values", "unique", "inverse"], + [(np.array([]), [], []), + (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]), + (np.array([np.nan, 2, 3]), [2, 3, np.nan], [2, 0, 1]), + (np.array([np.nan, 1, 1]), [1, np.nan], [1, 0, 0]), + (np.array([np.nan, 2, 3, 3, 2, 1]), [1, 2, 3, np.nan], + [3, 1, 2, 2, 1, 0]), + ]) +def test_nan_encode_numpy_python(values, unique, inverse, encode_type): + nan_unique, nan_inverse = encode_type(values, encode=True, allow_nan=True) + for nu, u in zip(nan_unique, unique): + if is_scalar_nan(nu): + assert is_scalar_nan(u) + else: + assert nu == u + for ni, i in zip(nan_inverse, inverse): + if is_scalar_nan(ni): + assert is_scalar_nan(i) + else: + assert ni == i def test_table_with_nan(): - # TODO - pass + table = _TableWithNan() + table.set('a', 0) + table.set(42, 42) + + with pytest.raises(KeyError): + table.get(np.nan) + with pytest.raises(KeyError): + table.get(float('nan')) + with pytest.raises(KeyError): + table.get('b') + + table.set(np.nan, 1) + assert table.get('a') == 0 + assert table.get(42) == 42 + assert table.get(np.nan) == 1 + assert table.get(float('nan')) == 1 + + with pytest.raises(KeyError): + table.get(None) From c4c4982180716f87470fabe93a4a6b4eac1d46a2 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Thu, 19 Sep 2019 16:34:35 +0200 Subject: [PATCH 13/16] add more tests --- sklearn/preprocessing/label.py | 11 +- sklearn/preprocessing/tests/test_label.py | 122 +++++++++++++++------- 2 files changed, 93 insertions(+), 40 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a2282f3ed6d7e..d75e00330f45a 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -217,11 +217,14 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): if is_nan_in_value: if not allow_nan: raise ValueError('Values contains NaN') - elif any(_get_mask(uniques, np.nan)): - diff = np.array(values_set - uniques_set) - diff = list(diff[~_get_mask(diff, np.nan)]) + if any(_get_mask(uniques, np.nan)): + diff = list(values_set - uniques_set) + if diff: + diff = np.array(diff) + diff = list(diff[~_get_mask(diff, np.nan)]) else: diff = list(values_set - uniques_set) + # diff = [] ### else: diff = list(values_set - uniques_set) @@ -252,7 +255,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): diff = np.setdiff1d(unique_values, uniques, 
assume_unique=True) if return_mask: - if diff: + if len(diff): valid_mask = np.in1d(values, uniques) else: valid_mask = np.ones(len(values), dtype=bool) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 5e7ef6103d8d8..4764e04cbd975 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -664,11 +664,18 @@ def test_label_encode_raise_nan(values): @pytest.mark.parametrize("allow_nan", [True, False]) -def test_encode_check_unknown(allow_nan): +@pytest.mark.parametrize( + "uniques, values", + [(np.array(['a', 'b', 'c'], dtype=object), + np.array(['a', 'b', 'c', 'd'], dtype=object)), + (np.array([], dtype=object), + np.array([1], dtype=object)), + (np.array([], dtype=float), + np.array([1], dtype=float)), + (np.array([1, 2, 3]), + np.array([1, 2, 3, 4]))]) +def test_encode_check_unknown(values, uniques, allow_nan): # test for the check_unknown parameter of _encode() - uniques = np.array([1, 2, 3]) - values = np.array([1, 2, 3, 4]) - # Default is True, raise error with pytest.raises(ValueError, match='y contains previously unseen labels'): @@ -676,16 +683,10 @@ def test_encode_check_unknown(allow_nan): allow_nan=allow_nan) # dont raise error if False - _encode(values, uniques, encode=True, check_unknown=False, - allow_nan=allow_nan) - - # parameter is ignored for object dtype - uniques = np.array(['a', 'b', 'c'], dtype=object) - values = np.array(['a', 'b', 'c', 'd'], dtype=object) - with pytest.raises(ValueError, - match='y contains previously unseen labels'): - _encode(values, uniques, encode=True, check_unknown=False, - allow_nan=allow_nan) + # check_unknown is always True for dtype object + if values.dtype != object: + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) @pytest.mark.parametrize( @@ -741,8 +742,67 @@ def test_check_unknown_nan_raise(uniques, values, return_mask): allow_nan=False) +@pytest.mark.parametrize('allow_nan', [True, False]) @pytest.mark.parametrize( - ["values", "unique", "inverse"], + "values, uniques, diff, mask", + [(np.array(['a', 'a', 'a'], dtype=object), ['a'], [], [1, 1, 1]), + (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [], + [1, 1, 1]), + (np.array(['a', 'b', 'c', 'a', 'b'], dtype=object), ['a', 'b', 'c'], + [], [1, 1, 1, 1, 1]), + (np.array([1, 2, 3]), [1, 2, 3], [], [1, 1, 1]), + (np.array([1, 1, 1]), [1], [], [1, 1, 1]), + (np.array([1, 2, 3, 3, 2, 1]), [1, 2, 3], [], [1] * 6), + ]) +def test_encode_check_unknown_diff(values, uniques, diff, mask, allow_nan): + + diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True, + allow_nan=allow_nan) + assert_array_equal(diff, diff_) + assert_array_equal(mask, mask_) + + +@pytest.mark.parametrize( + "values, uniques, diff, mask", + [(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]), [], [1, 1, 1]), + (np.array([1, 1, float('nan')]), np.array([1, np.nan]), [], [1, 1, 1]), + (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3, np.nan]), + [], [1] * 6), + ]) +def test_encode_check_unknown_diff_with_nan(values, uniques, diff, mask): + + diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True, + allow_nan=True) + assert_array_equal(diff, diff_) + assert_array_equal(mask, mask_) + + +def assert_array_equal_with_nan(x, y): + for a, b in zip(x, y): + if is_scalar_nan(a): + assert is_scalar_nan(b) + else: + assert a == b + + +@pytest.mark.parametrize( + "values, uniques, diff, mask", + [(np.array([1, 2, np.nan]), np.array([1, 2]), [np.nan], 
[1, 1, 0]), + (np.array([np.nan, float('nan')]), np.array([9]), [np.nan], [0, 0]), + (np.array([np.nan, 1, 1]), np.array([1]), [float('nan')], [0, 1, 1]), + (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3]), + [], [1, 0, 1, 1, 1, 1]), + ]) +def test_encode_check_unknown_diff_nan_unseen(values, uniques, diff, mask): + + diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True, + allow_nan=True) + assert_array_equal_with_nan(mask, mask_) + assert_array_equal_with_nan(diff, diff_) + + +@pytest.mark.parametrize( + "values, unique, inverse", [(np.array([]), [], []), (np.array(['a', 'a', 'a'], dtype=object), ['a'], [0, 0, 0]), (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [0, 2, 1]), @@ -767,7 +827,7 @@ def test_nan_unique_same_as_np(values, unique, inverse): @pytest.mark.parametrize( - ["values", "unique", "inverse"], + "values, unique, inverse", [(np.array([]), [], []), (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]), # (np.array([np.nan, 'a', 'a'], dtype=object), @@ -784,16 +844,8 @@ def test_nan_unique_same_as_np(values, unique, inverse): def test_nan_unique_nan(values, unique, inverse): nan_unique, nan_inverse = _nan_unique(values, return_inverse=True, allow_nan=True) - for nu, u in zip(nan_unique, unique): - if is_scalar_nan(nu): - assert is_scalar_nan(u) - else: - assert nu == u - for ni, i in zip(nan_inverse, inverse): - if is_scalar_nan(ni): - assert is_scalar_nan(i) - else: - assert ni == i + assert_array_equal_with_nan(nan_unique, unique) + assert_array_equal_with_nan(nan_inverse, inverse) @pytest.mark.parametrize('encode_type', [_encode_numpy, _encode_python]) @@ -808,16 +860,14 @@ def test_nan_unique_nan(values, unique, inverse): ]) def test_nan_encode_numpy_python(values, unique, inverse, encode_type): nan_unique, nan_inverse = encode_type(values, encode=True, allow_nan=True) - for nu, u in zip(nan_unique, unique): - if is_scalar_nan(nu): - assert is_scalar_nan(u) - else: - assert nu == u - for ni, i in zip(nan_inverse, inverse): - if is_scalar_nan(ni): - assert is_scalar_nan(i) - else: - assert ni == i + assert_array_equal_with_nan(nan_unique, unique) + assert_array_equal_with_nan(nan_inverse, inverse) + + # test also _nan_unique + nan_unique, nan_inverse = _nan_unique(values, return_inverse=True, + allow_nan=True) + assert_array_equal_with_nan(nan_unique, unique) + assert_array_equal_with_nan(nan_inverse, inverse) def test_table_with_nan(): From 6d9d0554282cce28fa0cd58e3536eb61d7edc897 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Mon, 23 Sep 2019 17:40:16 +0200 Subject: [PATCH 14/16] rename __DictWithNan --- sklearn/preprocessing/label.py | 30 ++++++---- sklearn/preprocessing/tests/test_label.py | 68 ++++++++++++----------- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index d75e00330f45a..c14ccba48a272 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -35,7 +35,13 @@ def _nan_unique(ar, return_inverse=False, allow_nan=False): - # mimic np.unique with allow_nan option + """ mimic np.unique where all nan are treated as the same one + + If allow_nan is False, ValueError is raise if ar contains nan. + Otherwise, if `ar` contains (possibly some) nan, + `uniques` will contains only one nan (contrary to np.unique), and + `inverse` will map all the nan from `ar` to this single nan in `uniques`. 
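+
+    For instance (a sketch worked out by hand, not a doctest):
+    ``_nan_unique(np.array([4., np.nan, 4., np.nan]), return_inverse=True,
+    allow_nan=True)`` gives ``uniques = [4., nan]`` and
+    ``inverse = [0, 1, 0, 1]``.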
+ """ if return_inverse: uniques, inverse = np.unique(ar, return_inverse=True) @@ -84,23 +90,23 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, return uniques -class _TableWithNan(object): - #  hash table which allows nan as a key +class _DictWithNan(dict): + # dict which allows nan as a key def __init__(self): - self.dict = dict() self.nan_value = None - def get(self, key): + def __getitem__(self, key): if is_scalar_nan(key) and self.nan_value is not None: return self.nan_value - return self.dict[key] + else: + return self.__dict__[key] - def set(self, key, value): + def __setitem__(self, key, item): if is_scalar_nan(key): - self.nan_value = value + self.nan_value = item else: - self.dict[key] = value + self.__dict__[key] = item def _encode_python(values, uniques=None, encode=False, allow_nan=False): @@ -118,11 +124,11 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): uniques = np.array(uniques, dtype=values.dtype) if encode: # hash is not enough to identify nan - table = _TableWithNan() + table = _DictWithNan() for i, val in enumerate(uniques): - table.set(val, i) + table[val] = i try: - encoded = np.array([table.get(val) for val in values]) + encoded = np.array([table[val] for val in values]) except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 4764e04cbd975..52e071986d7fc 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -26,7 +26,7 @@ from sklearn.preprocessing.label import _encode, _encode_numpy, _encode_python from sklearn.preprocessing.label import _encode_check_unknown from sklearn.preprocessing.label import _nan_unique -from sklearn.preprocessing.label import _TableWithNan +from sklearn.preprocessing.label import _DictWithNan from sklearn import datasets @@ -636,24 +636,6 @@ def test_encode_util(values, expected, allow_nan): assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) -@pytest.mark.parametrize("dtype", [float, object]) -def test_label_encode_with_nan(dtype): - - # encode all nan within one category - assert len(_encode(np.asarray([np.nan, np.nan, float('nan')], dtype=dtype), - allow_nan=True)) == 1 - assert len(_encode(np.asarray([4, np.nan, float('nan')], dtype=dtype), - allow_nan=True)) == 2 - - # the encoded size corresponds to the values size - assert len(_encode(np.asarray([np.nan, np.nan], dtype=dtype), - encode=True, allow_nan=True)[1]) == 2 - - encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan], dtype=dtype), - encode=True, allow_nan=True)[1] - assert_array_equal(encoded, [0, 1, 2, 2, 2]) - - @pytest.mark.parametrize("values", [np.asarray([np.nan, np.nan], dtype=float), np.asarray([np.nan, np.nan], dtype=object)]) @@ -765,7 +747,8 @@ def test_encode_check_unknown_diff(values, uniques, diff, mask, allow_nan): @pytest.mark.parametrize( "values, uniques, diff, mask", [(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]), [], [1, 1, 1]), - (np.array([1, 1, float('nan')]), np.array([1, np.nan]), [], [1, 1, 1]), + (np.array([1, 1, float('nan')]), np.array([1, np.nan]), + [], [1, 1, 1]), (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3, np.nan]), [], [1] * 6), ]) @@ -785,6 +768,25 @@ def assert_array_equal_with_nan(x, y): assert a == b +@pytest.mark.parametrize( + "values, uniques, encoded", + [(np.array([4, np.nan, float('nan')]), [4, np.nan], + [0, 1, 1]), + (np.array([np.nan, float('nan')]), [np.nan], + 
[0, 0]), + (np.array([np.nan, 4, np.nan, 4]), [4, np.nan], + [1, 0, 1, 0]), + (np.array([np.nan]), [np.nan], [0]), + ]) +def test_label_encode_with_nan(values, uniques, encoded): + + assert_array_equal_with_nan(_encode(values, allow_nan=True), uniques) + + uniques_, encoded_ = _encode(values, encode=True, allow_nan=True) + assert_array_equal_with_nan(uniques, uniques_) + assert_array_equal_with_nan(encoded, encoded_) + + @pytest.mark.parametrize( "values, uniques, diff, mask", [(np.array([1, 2, np.nan]), np.array([1, 2]), [np.nan], [1, 1, 0]), @@ -870,23 +872,23 @@ def test_nan_encode_numpy_python(values, unique, inverse, encode_type): assert_array_equal_with_nan(nan_inverse, inverse) -def test_table_with_nan(): - table = _TableWithNan() - table.set('a', 0) - table.set(42, 42) +def test_dict_with_nan(): + table = _DictWithNan() + table['a'] = 0 + table[42] = 42 with pytest.raises(KeyError): - table.get(np.nan) + table[np.nan] with pytest.raises(KeyError): - table.get(float('nan')) + table[float('nan')] with pytest.raises(KeyError): - table.get('b') + table['b'] - table.set(np.nan, 1) - assert table.get('a') == 0 - assert table.get(42) == 42 - assert table.get(np.nan) == 1 - assert table.get(float('nan')) == 1 + table[np.nan] = 1 + assert table['a'] == 0 + assert table[42] == 42 + assert table[np.nan] == 1 + assert table[float('nan')] == 1 with pytest.raises(KeyError): - table.get(None) + table[None] From c76b42b70ceb4ff9847f6f4e7b4fe3c73cb71836 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Mon, 23 Sep 2019 17:57:12 +0200 Subject: [PATCH 15/16] typo --- sklearn/preprocessing/tests/test_label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 52e071986d7fc..b6886e87d943c 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -667,8 +667,8 @@ def test_encode_check_unknown(values, uniques, allow_nan): # dont raise error if False # check_unknown is always True for dtype object if values.dtype != object: - _encode(values, uniques, encode=True, check_unknown=False, - allow_nan=allow_nan) + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) @pytest.mark.parametrize( From f3a120d2a328b1d9b7b576b1444a395e823cb6fb Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 25 Sep 2019 11:15:48 +0200 Subject: [PATCH 16/16] minor --- sklearn/preprocessing/label.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index c14ccba48a272..cd320455f8970 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -259,6 +259,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): assume_unique=True) else: diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + diff = list(diff) if return_mask: if len(diff):