diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 74511dea6ac15..cd320455f8970 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -18,12 +18,13 @@
 from ..utils.sparsefuncs import min_max_axis
 from ..utils import column_or_1d
+from ..utils import is_scalar_nan
 from ..utils.validation import check_array
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _num_samples
 from ..utils.multiclass import unique_labels
 from ..utils.multiclass import type_of_target
-
+from ..utils.mask import _get_mask
 
 __all__ = [
     'label_binarize',
@@ -33,18 +34,53 @@
 ]
 
 
-def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
+def _nan_unique(ar, return_inverse=False, allow_nan=False):
+    """Mimic np.unique, treating all NaN values as a single one.
+
+    If allow_nan is False, a ValueError is raised when `ar` contains NaN.
+    Otherwise, if `ar` contains (possibly several) NaN values,
+    `uniques` will contain only one NaN (contrary to np.unique), and
+    `inverse` will map every NaN in `ar` to this single NaN in `uniques`.
+    """
+
+    if return_inverse:
+        uniques, inverse = np.unique(ar, return_inverse=True)
+    else:
+        uniques = np.unique(ar)
+
+    nan_idx = None
+    # np.nan is always sorted last
+    if len(uniques) and is_scalar_nan(uniques[-1]):
+        if not allow_nan:
+            raise ValueError('Values contains NaN and allow_nan=False')
+        nan_idx = np.searchsorted(uniques, np.nan)
+        uniques = uniques[:nan_idx + 1]
+
+    if return_inverse and nan_idx is not None:
+        inverse[inverse > nan_idx] = nan_idx
+
+    if return_inverse:
+        return uniques, inverse
+    else:
+        return uniques
+
+
+def _encode_numpy(values, uniques=None, encode=False, check_unknown=True,
+                  allow_nan=False):
     # only used in _encode below, see docstring there for details
+
     if uniques is None:
         if encode:
-            uniques, encoded = np.unique(values, return_inverse=True)
+            uniques, encoded = _nan_unique(values, return_inverse=True,
+                                           allow_nan=allow_nan)
             return uniques, encoded
         else:
             # unique sorts
-            return np.unique(values)
+            uniques = _nan_unique(values, allow_nan=allow_nan)
+            return uniques
     if encode:
         if check_unknown:
-            diff = _encode_check_unknown(values, uniques)
+            diff = _encode_check_unknown(values, uniques, allow_nan=allow_nan)
             if diff:
                 raise ValueError("y contains previously unseen labels: %s"
                                  % str(diff))
@@ -54,15 +90,45 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
     return uniques
 
 
-def _encode_python(values, uniques=None, encode=False):
+class _DictWithNan(dict):
+    # dict which maps any NaN key to a single shared entry
+
+    def __init__(self):
+        self.nan_value = None
+
+    def __getitem__(self, key):
+        if is_scalar_nan(key) and self.nan_value is not None:
+            return self.nan_value
+        else:
+            return self.__dict__[key]
+
+    def __setitem__(self, key, item):
+        if is_scalar_nan(key):
+            self.nan_value = item
+        else:
+            self.__dict__[key] = item
+
+
+def _encode_python(values, uniques=None, encode=False, allow_nan=False):
     # only used in _encode below, see docstring there for details
     if uniques is None:
-        uniques = sorted(set(values))
+        missing_mask = _get_mask(values, np.nan)
+        if np.any(missing_mask):
+            if not allow_nan:
+                raise ValueError('Values contains NaN and allow_nan=False')
+            else:
+                # need np.sort to ensure nan is sorted last
+                uniques = np.sort(list(set(values[~missing_mask]) | {np.nan}))
+        else:
+            uniques = sorted(set(values))
         uniques = np.array(uniques, dtype=values.dtype)
     if encode:
-        table = {val: i for i, val in enumerate(uniques)}
+        # a plain dict is not enough to identify nan since nan != nan
+        table = _DictWithNan()
+        for i, val in enumerate(uniques):
+            table[val] = i
         try:
-            encoded = np.array([table[v] for v in values])
+            encoded = np.array([table[val] for val in values])
         except KeyError as e:
             raise ValueError("y contains previously unseen labels: %s"
                              % str(e))
@@ -71,7 +137,8 @@ def _encode_python(values, uniques=None, encode=False):
     return uniques
 
 
-def _encode(values, uniques=None, encode=False, check_unknown=True):
+def _encode(values, uniques=None, encode=False, check_unknown=True,
+            allow_nan=False):
     """Helper function to factorize (find uniques) and encode values.
 
     Uses pure python method for object dtype, and numpy method for
@@ -97,6 +164,9 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
         True in this case. This parameter is useful for
         _BaseEncoder._transform() to avoid calling _encode_check_unknown()
         twice.
+    allow_nan : bool, default False
+        If True, encode `np.nan` as an additional category. Otherwise raise
+        an error if NaN values are present.
 
     Returns
     -------
@@ -109,16 +179,16 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
     """
     if values.dtype == object:
         try:
-            res = _encode_python(values, uniques, encode)
+            res = _encode_python(values, uniques, encode, allow_nan)
         except TypeError:
            raise TypeError("argument must be a string or number")
         return res
     else:
         return _encode_numpy(values, uniques, encode,
-                             check_unknown=check_unknown)
+                             check_unknown, allow_nan)
 
 
-def _encode_check_unknown(values, uniques, return_mask=False):
+def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False):
     """
     Helper function to check for unknowns in values to be encoded.
 
@@ -134,6 +204,8 @@ def _encode_check_unknown(values, uniques, return_mask=False):
     return_mask : bool, default False
         If True, return a mask of the same shape as `values` indicating the
        valid values.
+    allow_nan : bool, default False
+        If False, raise an error if NaN values are present.
 
     Returns
     -------
@@ -146,7 +218,21 @@
     """
     if values.dtype == object:
         uniques_set = set(uniques)
-        diff = list(set(values) - uniques_set)
+        values_set = set(values)
+        is_nan_in_value = any([is_scalar_nan(val) for val in values_set])
+        if is_nan_in_value:
+            if not allow_nan:
+                raise ValueError('Values contains NaN')
+            if any(_get_mask(uniques, np.nan)):
+                diff = list(values_set - uniques_set)
+                if diff:
+                    diff = np.array(diff)
+                    diff = list(diff[~_get_mask(diff, np.nan)])
+            else:
+                diff = list(values_set - uniques_set)
+        else:
+            diff = list(values_set - uniques_set)
+
         if return_mask:
             if diff:
                 valid_mask = np.array([val in uniques_set for val in values])
@@ -157,9 +244,25 @@
         return diff
     else:
         unique_values = np.unique(values)
-        diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
+        mask_nan_in_values = _get_mask(unique_values, np.nan)
+        if np.any(mask_nan_in_values):
+            if not allow_nan:
+                raise ValueError('Values contains NaN')
+            else:
+                mask_nan_in_uniques = _get_mask(uniques, np.nan)
+                if np.any(mask_nan_in_uniques):
+                    diff = np.setdiff1d(unique_values[~mask_nan_in_values],
+                                        uniques[~mask_nan_in_uniques],
+                                        assume_unique=True)
+                else:
+                    diff = np.setdiff1d(unique_values, uniques,
+                                        assume_unique=True)
+        else:
+            diff = np.setdiff1d(unique_values, uniques, assume_unique=True)
+        diff = list(diff)
+
         if return_mask:
-            if diff:
+            if len(diff):
                 valid_mask = np.in1d(values, uniques)
             else:
                 valid_mask = np.ones(len(values), dtype=bool)
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 9b1dc2fc7f4a2..b6886e87d943c 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -10,6 +10,7 @@
 from scipy.sparse import lil_matrix
 
 from sklearn.utils.multiclass import type_of_target
+from sklearn.utils import is_scalar_nan
 
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_warns_message
@@ -22,7 +23,10 @@
 from sklearn.preprocessing.label import _inverse_binarize_thresholding
 from sklearn.preprocessing.label import _inverse_binarize_multiclass
-from sklearn.preprocessing.label import _encode
+from sklearn.preprocessing.label import _encode, _encode_numpy, _encode_python
+from sklearn.preprocessing.label import _encode_check_unknown
+from sklearn.preprocessing.label import _nan_unique
+from sklearn.preprocessing.label import _DictWithNan
 
 from sklearn import datasets
@@ -612,6 +616,7 @@ def test_inverse_binarize_multiclass():
     assert_array_equal(got, np.array([1, 1, 0]))
 
 
+@pytest.mark.parametrize("allow_nan", [True, False])
 @pytest.mark.parametrize(
     "values, expected",
     [(np.array([2, 1, 3, 1, 3], dtype='int64'),
@@ -621,32 +626,269 @@ def test_inverse_binarize_multiclass():
      np.array([1, 2, 3], dtype='int64')),
     (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
      np.array(['a', 'b', 'c'], dtype=object)),
     (np.array(['b', 'a', 'c', 'a', 'c']),
      np.array(['a', 'b', 'c']))],
    ids=['int64', 'object', 'str'])
-def test_encode_util(values, expected):
+def test_encode_util(values, expected, allow_nan):
     uniques = _encode(values)
     assert_array_equal(uniques, expected)
 
-    uniques, encoded = _encode(values, encode=True)
+    uniques, encoded = _encode(values, encode=True, allow_nan=allow_nan)
     assert_array_equal(uniques, expected)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
 
-    _, encoded = _encode(values, uniques, encode=True)
+    _, encoded = _encode(values, uniques, encode=True, allow_nan=allow_nan)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
 
 
-def test_encode_check_unknown():
-    # test for the check_unknown parameter of _encode()
-    uniques = np.array([1, 2, 3])
-    values = np.array([1, 2, 3, 4])
+@pytest.mark.parametrize("values",
+                         [np.asarray([np.nan, np.nan], dtype=float),
+                          np.asarray([np.nan, np.nan], dtype=object)])
+def test_label_encode_raise_nan(values):
+    msg = 'Values contains NaN'
+    with pytest.raises(ValueError, match=msg):
+        _encode(values, allow_nan=False)
+
+@pytest.mark.parametrize("allow_nan", [True, False])
+@pytest.mark.parametrize(
+    "uniques, values",
+    [(np.array(['a', 'b', 'c'], dtype=object),
+      np.array(['a', 'b', 'c', 'd'], dtype=object)),
+     (np.array([], dtype=object),
+      np.array([1], dtype=object)),
+     (np.array([], dtype=float),
+      np.array([1], dtype=float)),
+     (np.array([1, 2, 3]),
+      np.array([1, 2, 3, 4]))])
+def test_encode_check_unknown(values, uniques, allow_nan):
+    # test for the check_unknown parameter of _encode()
 
     # Default is True, raise error
     with pytest.raises(ValueError,
                        match='y contains previously unseen labels'):
-        _encode(values, uniques, encode=True, check_unknown=True)
+        _encode(values, uniques, encode=True, check_unknown=True,
+                allow_nan=allow_nan)
 
     # dont raise error if False
-    _encode(values, uniques, encode=True, check_unknown=False)
+    # check_unknown is always True for dtype object
+    if values.dtype != object:
+        _encode(values, uniques, encode=True, check_unknown=False,
+                allow_nan=allow_nan)
+
+
+@pytest.mark.parametrize(
+    "uniques, values",
+    [(np.array([1, 2, 3]),
+      np.array([1, 2, 3, np.nan])),
+     (np.array([np.nan, 2, 3]),
+      np.array([np.nan, 2, 3, 4]))])
+def test_encode_check_unknown_nan_float(uniques, values):
+    # test for the check_unknown parameter of _encode() with nan present
 
-    # parameter is ignored for object dtype
-    uniques = np.array(['a', 'b', 'c'], dtype=object)
-    values = np.array(['a', 'b', 'c', 'd'], dtype=object)
     with pytest.raises(ValueError,
-                       match='y contains previously unseen labels'):
-        _encode(values, uniques, encode=True, check_unknown=False)
+                       match='y contains previously unseen label'):
+        _encode(values, uniques, encode=True, check_unknown=True,
+                allow_nan=True)
+
+    # don't raise an error if check_unknown is False
+    _encode(values, uniques, encode=True, check_unknown=False, allow_nan=True)
+
+
+@pytest.mark.parametrize(
+    "uniques, values",
+    [(np.array(['a', 'b', 'c'], dtype=object),
+      np.array(['a', 'b', 'c', np.nan], dtype=object)),
+     (np.array([np.nan, 'b', 'c'], dtype=object),
+      np.array([np.nan, 'b', 'c', 'd'], dtype=object))])
+def test_encode_check_unknown_nan_object(uniques, values):
+    # test for the check_unknown parameter of _encode() with nan present
+    # parameter check_unknown is ignored for object dtype
+    with pytest.raises(ValueError,
+                       match='y contains previously unseen label'):
+        _encode(values, uniques, encode=True, check_unknown=True,
+                allow_nan=True)
+
+
+@pytest.mark.parametrize("return_mask", [True, False])
+@pytest.mark.parametrize(
+    "uniques, values",
+    [(np.array(['a', 'b', 'c'], dtype=object),
+      np.array(['a', 'b', 'c', np.nan], dtype=object)),
+     (np.array([np.nan, 'b', 'c'], dtype=object),
+      np.array([np.nan, 'b', 'c', 'd'], dtype=object)),
+     (np.array([1, 2, 3]),
+      np.array([1, 2, 3, np.nan])),
+     (np.array([np.nan, 2, 3]),
+      np.array([np.nan, 2, 3, 4]))])
+def test_check_unknown_nan_raise(uniques, values, return_mask):
+    # test for the check_unknown parameter of _encode() with nan present
+
+    with pytest.raises(ValueError,
+                       match='Values contains NaN'):
+        _encode_check_unknown(values, uniques, return_mask=return_mask,
+                              allow_nan=False)
+
+
+@pytest.mark.parametrize('allow_nan', [True, False])
+@pytest.mark.parametrize(
+    "values, uniques, diff, mask",
+    [(np.array(['a', 'a', 'a'], dtype=object), ['a'], [], [1, 1, 1]),
+     (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [],
+      [1, 1, 1]),
+     (np.array(['a', 'b', 'c', 'a', 'b'], dtype=object), ['a', 'b', 'c'],
+      [], [1, 1, 1, 1, 1]),
+     (np.array([1, 2, 3]), [1, 2, 3], [], [1, 1, 1]),
+     (np.array([1, 1, 1]), [1], [], [1, 1, 1]),
+     (np.array([1, 2, 3, 3, 2, 1]), [1, 2, 3], [], [1] * 6),
+     ])
+def test_encode_check_unknown_diff(values, uniques, diff, mask, allow_nan):
+
+    diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True,
+                                         allow_nan=allow_nan)
+    assert_array_equal(diff, diff_)
+    assert_array_equal(mask, mask_)
+
+
+@pytest.mark.parametrize(
+    "values, uniques, diff, mask",
+    [(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]), [], [1, 1, 1]),
+     (np.array([1, 1, float('nan')]), np.array([1, np.nan]),
+      [], [1, 1, 1]),
+     (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3, np.nan]),
+      [], [1] * 6),
+     ])
+def test_encode_check_unknown_diff_with_nan(values, uniques, diff, mask):
+
+    diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True,
+                                         allow_nan=True)
+    assert_array_equal(diff, diff_)
+    assert_array_equal(mask, mask_)
+
+
+def assert_array_equal_with_nan(x, y):
+    for a, b in zip(x, y):
+        if is_scalar_nan(a):
+            assert is_scalar_nan(b)
+        else:
+            assert a == b
+
+
+@pytest.mark.parametrize(
+    "values, uniques, encoded",
+    [(np.array([4, np.nan, float('nan')]), [4, np.nan],
+      [0, 1, 1]),
+     (np.array([np.nan, float('nan')]), [np.nan],
+      [0, 0]),
+     (np.array([np.nan, 4, np.nan, 4]), [4, np.nan],
+      [1, 0, 1, 0]),
+     (np.array([np.nan]), [np.nan], [0]),
+     ])
+def test_label_encode_with_nan(values, uniques, encoded):
+
+    assert_array_equal_with_nan(_encode(values, allow_nan=True), uniques)
+
+    uniques_, encoded_ = _encode(values, encode=True, allow_nan=True)
+    assert_array_equal_with_nan(uniques, uniques_)
+    assert_array_equal_with_nan(encoded, encoded_)
+
+
+@pytest.mark.parametrize(
+    "values, uniques, diff, mask",
+    [(np.array([1, 2, np.nan]), np.array([1, 2]), [np.nan], [1, 1, 0]),
+     (np.array([np.nan, float('nan')]), np.array([9]), [np.nan], [0, 0]),
+     (np.array([np.nan, 1, 1]), np.array([1]), [float('nan')], [0, 1, 1]),
+     (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3]),
+      [], [1, 0, 1, 1, 1, 1]),
+     ])
+def test_encode_check_unknown_diff_nan_unseen(values, uniques, diff, mask):
+
+    diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True,
+                                         allow_nan=True)
+    assert_array_equal_with_nan(mask, mask_)
+    assert_array_equal_with_nan(diff, diff_)
+
+
+@pytest.mark.parametrize(
+    "values, unique, inverse",
+    [(np.array([]), [], []),
+     (np.array(['a', 'a', 'a'], dtype=object), ['a'], [0, 0, 0]),
+     (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [0, 2, 1]),
+     (np.array(['a', 'b', 'c', 'a', 'b'], dtype=object), ['a', 'b', 'c'],
+      [0, 1, 2, 0, 1]),
+     (np.array([1, 2, 3]), [1, 2, 3], [0, 1, 2]),
+     (np.array([1, 1, 1]), [1], [0, 0, 0]),
+     (np.array([1, 2, 3, 3, 2, 1]), [1, 2, 3], [0, 1, 2, 2, 1, 0]),
+     ])
+def test_nan_unique_same_as_np(values, unique, inverse):
+    # _nan_unique must behave exactly like np.unique when no NaN is present
+
+    assert_array_equal(unique, _nan_unique(values))
+    assert_array_equal(unique, np.unique(values))
+
+    u, i = _nan_unique(values, return_inverse=True)
+    assert_array_equal(unique, u)
+    assert_array_equal(inverse, i)
+    u, i = np.unique(values, return_inverse=True)
+    assert_array_equal(unique, u)
+    assert_array_equal(inverse, i)
+
+
+@pytest.mark.parametrize(
+    "values, unique, inverse",
+    [(np.array([]), [], []),
+     (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]),
+     # (np.array([np.nan, 'a', 'a'], dtype=object),
+     #  ['a', np.nan], [1, 0, 0]),
+     # (np.array([np.nan, 'c', 'b'], dtype=object),
+     #  ['b', 'c', np.nan], [0, 2, 1]),
+     # (np.array([np.nan, 'b', 'c', 'a', 'b'], dtype=object),
+     #  ['a', 'b', 'c', np.nan], [3, 1, 2, 0, 1]),
+     (np.array([np.nan, 2, 3]), [2, 3, np.nan], [2, 0, 1]),
+     (np.array([np.nan, 1, 1]), [1, np.nan], [1, 0, 0]),
+     (np.array([np.nan, 2, 3, 3, 2, 1]), [1, 2, 3, np.nan],
+      [3, 1, 2, 2, 1, 0]),
+     ])
+def test_nan_unique_nan(values, unique, inverse):
+    nan_unique, nan_inverse = _nan_unique(values, return_inverse=True,
+                                          allow_nan=True)
+    assert_array_equal_with_nan(nan_unique, unique)
+    assert_array_equal_with_nan(nan_inverse, inverse)
+
+
+@pytest.mark.parametrize('encode_type', [_encode_numpy, _encode_python])
+@pytest.mark.parametrize(
+    ["values", "unique", "inverse"],
+    [(np.array([]), [], []),
+     (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]),
+     (np.array([np.nan, 2, 3]), [2, 3, np.nan], [2, 0, 1]),
+     (np.array([np.nan, 1, 1]), [1, np.nan], [1, 0, 0]),
+     (np.array([np.nan, 2, 3, 3, 2, 1]), [1, 2, 3, np.nan],
+      [3, 1, 2, 2, 1, 0]),
+     ])
+def test_nan_encode_numpy_python(values, unique, inverse, encode_type):
+    nan_unique, nan_inverse = encode_type(values, encode=True, allow_nan=True)
+    assert_array_equal_with_nan(nan_unique, unique)
+    assert_array_equal_with_nan(nan_inverse, inverse)
+
+    # test also _nan_unique
+    nan_unique, nan_inverse = _nan_unique(values, return_inverse=True,
+                                          allow_nan=True)
+    assert_array_equal_with_nan(nan_unique, unique)
+    assert_array_equal_with_nan(nan_inverse, inverse)
+
+
+def test_dict_with_nan():
+    table = _DictWithNan()
+    table['a'] = 0
+    table[42] = 42
+
+    with pytest.raises(KeyError):
+        table[np.nan]
+    with pytest.raises(KeyError):
+        table[float('nan')]
+    with pytest.raises(KeyError):
+        table['b']
+
+    table[np.nan] = 1
+    assert table['a'] == 0
+    assert table[42] == 42
+    assert table[np.nan] == 1
+    assert table[float('nan')] == 1
+
+    with pytest.raises(KeyError):
+        table[None]
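
Reviewer note: a small standalone illustration (not part of the patch) of the NaN pitfall that motivates _nan_unique and _DictWithNan. NaN never compares equal to another NaN, so a plain dict built over uniques cannot look up a fresh NaN object, and np.unique does not necessarily collapse repeated NaN values on the NumPy versions targeted here (collapsing was only introduced in NumPy 1.21).

    import numpy as np

    # NaN is never equal to itself, so equality-based lookups miss it.
    assert np.nan != np.nan

    # A plain dict only finds the *same* NaN object (via the identity check);
    # a different NaN object raises KeyError, which _DictWithNan works around.
    table = {np.nan: 0}
    assert table[np.nan] == 0      # same object, found by identity
    try:
        table[float('nan')]        # different NaN object
    except KeyError:
        print("a different NaN object is not found in a plain dict")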
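A usage sketch of the new private helpers, mirroring test_label_encode_with_nan and test_label_encode_raise_nan above (illustrative only; _encode and _nan_unique are private APIs and their import path follows the test file):

    import numpy as np
    from sklearn.preprocessing.label import _encode, _nan_unique

    values = np.array([np.nan, 4, np.nan, 4])

    # With allow_nan=True every NaN collapses into one category, sorted last.
    uniques, encoded = _encode(values, encode=True, allow_nan=True)
    # uniques -> array([ 4., nan]); encoded -> array([1, 0, 1, 0])
    assert len(_nan_unique(values, allow_nan=True)) == 2

    # With the default allow_nan=False, NaN input is rejected.
    try:
        _encode(values, allow_nan=False)
    except ValueError as exc:
        print(exc)  # Values contains NaN and allow_nan=False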