Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[WIP] Handle missing values in label._encode() #15009

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 119 additions & 16 deletions sklearn/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@

from ..utils.sparsefuncs import min_max_axis
from ..utils import column_or_1d
from ..utils import is_scalar_nan
from ..utils.validation import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import _num_samples
from ..utils.multiclass import unique_labels
from ..utils.multiclass import type_of_target

from ..utils.mask import _get_mask

__all__ = [
'label_binarize',
Expand All @@ -33,18 +34,53 @@
]


def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
def _nan_unique(ar, return_inverse=False, allow_nan=False):
    """Like ``np.unique`` but with all NaN collapsed into a single entry.

    ``np.unique`` keeps one entry per NaN object because ``nan != nan``.
    Here, when ``allow_nan`` is True and ``ar`` contains NaN, the returned
    ``uniques`` ends with exactly one NaN and (if requested) ``inverse``
    maps every NaN of ``ar`` onto that single entry.  When ``allow_nan``
    is False, a ValueError is raised if ``ar`` contains NaN.
    """
    if return_inverse:
        uniques, inverse = np.unique(ar, return_inverse=True)
    else:
        uniques, inverse = np.unique(ar), None

    # np.unique sorts NaN last, so inspecting the final entry is enough to
    # know whether any NaN is present.
    if len(uniques) and is_scalar_nan(uniques[-1]):
        if not allow_nan:
            raise ValueError('Values contains NaN and allow_nan=False')
        # searchsorted follows the same total order as sort (NaN largest),
        # so this is the position of the first NaN; drop the duplicates
        # that follow it.
        first_nan = np.searchsorted(uniques, np.nan)
        uniques = uniques[:first_nan + 1]
        if inverse is not None:
            # Redirect indices that pointed at the removed duplicate NaNs.
            inverse[inverse > first_nan] = first_nan

    if return_inverse:
        return uniques, inverse
    return uniques


def _encode_numpy(values, uniques=None, encode=False, check_unknown=True,
allow_nan=False):
# only used in _encode below, see docstring there for details

if uniques is None:
if encode:
uniques, encoded = np.unique(values, return_inverse=True)
uniques, encoded = _nan_unique(values, return_inverse=True,
allow_nan=allow_nan)
return uniques, encoded
else:
# unique sorts
return np.unique(values)
uniques = _nan_unique(values, allow_nan=allow_nan)
return uniques
if encode:
if check_unknown:
diff = _encode_check_unknown(values, uniques)
diff = _encode_check_unknown(values, uniques, allow_nan=allow_nan)
if diff:
raise ValueError("y contains previously unseen labels: %s"
% str(diff))
Expand All @@ -54,15 +90,45 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
return uniques


def _encode_python(values, uniques=None, encode=False):
class _DictWithNan(dict):
    """Mapping that treats every NaN key as one single key.

    Plain ``dict`` lookup relies on hash/equality, and since ``nan != nan``
    each distinct NaN object would get its own entry.  This class keeps a
    single dedicated slot for the NaN key instead, while ordinary keys go
    through a regular dict.

    Lookup of a missing key (NaN or not) raises ``KeyError``, matching the
    contract callers such as ``_encode_python`` rely on.
    """

    # Unique sentinel meaning "no value stored under NaN".  Using an opaque
    # object (rather than None) lets callers store None under a NaN key.
    _NO_NAN_VALUE = object()

    def __init__(self):
        # Value associated with the NaN key, if any.
        self._nan_value = self._NO_NAN_VALUE
        # Non-NaN entries live in a dedicated dict.  The previous
        # implementation stored them in ``self.__dict__``, which collided
        # with the sentinel attribute: a literal 'nan_value' key would
        # clobber it, and looking up 'nan_value' returned the sentinel
        # instead of raising KeyError.
        self._items = {}

    def __getitem__(self, key):
        if is_scalar_nan(key):
            if self._nan_value is self._NO_NAN_VALUE:
                raise KeyError(key)
            return self._nan_value
        return self._items[key]

    def __setitem__(self, key, item):
        if is_scalar_nan(key):
            self._nan_value = item
        else:
            self._items[key] = item


def _encode_python(values, uniques=None, encode=False, allow_nan=False):
# only used in _encode below, see docstring there for details
if uniques is None:
uniques = sorted(set(values))
missing_mask = _get_mask(values, np.nan)
if np.any(missing_mask):
if not allow_nan:
raise ValueError('Values contains NaN and allow_nan=False')
else:
# need np.sort to ensure nan is sorted last
uniques = np.sort(list(set(values[~missing_mask]) | {np.nan}))
else:
uniques = sorted(set(values))
uniques = np.array(uniques, dtype=values.dtype)
if encode:
table = {val: i for i, val in enumerate(uniques)}
# hash is not enough to identify nan
table = _DictWithNan()
for i, val in enumerate(uniques):
table[val] = i
try:
encoded = np.array([table[v] for v in values])
encoded = np.array([table[val] for val in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
Expand All @@ -71,7 +137,8 @@ def _encode_python(values, uniques=None, encode=False):
return uniques


def _encode(values, uniques=None, encode=False, check_unknown=True):
def _encode(values, uniques=None, encode=False, check_unknown=True,
allow_nan=False):
"""Helper function to factorize (find uniques) and encode values.

Uses pure python method for object dtype, and numpy method for
Expand All @@ -97,6 +164,9 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
True in this case. This parameter is useful for
_BaseEncoder._transform() to avoid calling _encode_check_unknown()
twice.
allow_nan : bool, default False
if True, encode `np.nan` as another category. Otherwise raise an error
if nan are present

Returns
-------
Expand All @@ -109,16 +179,16 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
"""
if values.dtype == object:
try:
res = _encode_python(values, uniques, encode)
res = _encode_python(values, uniques, encode, allow_nan)
except TypeError:
raise TypeError("argument must be a string or number")
return res
else:
return _encode_numpy(values, uniques, encode,
check_unknown=check_unknown)
check_unknown, allow_nan)


def _encode_check_unknown(values, uniques, return_mask=False):
def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False):
"""
Helper function to check for unknowns in values to be encoded.

Expand All @@ -134,6 +204,8 @@ def _encode_check_unknown(values, uniques, return_mask=False):
return_mask : bool, default False
If True, return a mask of the same shape as `values` indicating
the valid values.
allow_nan : bool, default False
If False, raise an error if NaN are present.

Returns
-------
Expand All @@ -146,7 +218,22 @@ def _encode_check_unknown(values, uniques, return_mask=False):
"""
if values.dtype == object:
uniques_set = set(uniques)
diff = list(set(values) - uniques_set)
values_set = set(values)
is_nan_in_value = any([is_scalar_nan(val) for val in values_set])
if is_nan_in_value:
if not allow_nan:
raise ValueError('Values contains NaN')
if any(_get_mask(uniques, np.nan)):
diff = list(values_set - uniques_set)
if diff:
diff = np.array(diff)
diff = list(diff[~_get_mask(diff, np.nan)])
else:
diff = list(values_set - uniques_set)
# diff = [] ###
else:
diff = list(values_set - uniques_set)

if return_mask:
if diff:
valid_mask = np.array([val in uniques_set for val in values])
Expand All @@ -157,9 +244,25 @@ def _encode_check_unknown(values, uniques, return_mask=False):
return diff
else:
unique_values = np.unique(values)
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
mask_nan_in_values = _get_mask(unique_values, np.nan)
if np.any(mask_nan_in_values):
if not allow_nan:
raise ValueError('Values contains NaN')
else:
mask_nan_in_uniques = _get_mask(uniques, np.nan)
if np.any(mask_nan_in_uniques):
diff = np.setdiff1d(unique_values[~mask_nan_in_values],
uniques[~mask_nan_in_uniques],
assume_unique=True)
else:
diff = np.setdiff1d(unique_values, uniques,
assume_unique=True)
else:
diff = np.setdiff1d(unique_values, uniques, assume_unique=True)
diff = list(diff)

if return_mask:
if diff:
if len(diff):
valid_mask = np.in1d(values, uniques)
else:
valid_mask = np.ones(len(values), dtype=bool)
Expand Down
Loading