Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[WIP] Handle NaNs in OneHotEncoder #16749

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/modules/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,18 @@ columns for this feature will be all zeros
array([[1., 0., 0., 0., 0., 0.]])


You can also specify the parameter ``handle_missing`` to deal with ``NaN`` values
during either fitting or transforming. It is ``None`` by default, which will raise a
``ValueError``. When ``handle_missing='indicator'``, ``NaN`` will be represented by a
separate one-hot column. Whereas if ``handle_missing='ignore'``, the resulting one-hot
encoded columns for this feature will be all zeros (similar to ``handle_unknown='ignore'``)

Note that, for scikit-learn to handle your missing values using OneHotEncoder,
you have to pass a placeholder of what should be recorded as a missing value.
This is the `missing_values` parameter and possible values can be either a
`NaN` or a custom value of your choice.


It is also possible to encode each column into ``n_categories - 1`` columns
instead of ``n_categories`` columns by using the ``drop`` parameter. This
parameter allows the user to specify a category for each feature to be dropped.
Expand Down
93 changes: 65 additions & 28 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.fixes import _object_dtype_isnan
from ..utils.validation import check_is_fitted

from ._label import _encode, _encode_check_unknown


__all__ = [
'OneHotEncoder',
'OrdinalEncoder'
Expand Down Expand Up @@ -39,25 +38,25 @@ def _check_X(self, X):
"""
if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
X_temp = check_array(
X, dtype=None, force_all_finite=self.force_all_finite)
if (not hasattr(X, 'dtype')
and np.issubdtype(X_temp.dtype, np.str_)):
X = check_array(X, dtype=np.object)
X = check_array(X, dtype=np.object,
force_all_finite=self.force_all_finite)
else:
X = X_temp
needs_validation = False
else:
# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.
needs_validation = True

# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.

n_samples, n_features = X.shape
X_columns = []

for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
force_all_finite=self.force_all_finite)
X_columns.append(Xi)

return X_columns, n_samples, n_features
Expand All @@ -69,7 +68,7 @@ def _get_feature(self, X, feature_idx):
# numpy arrays, sparse arrays
return X[:, feature_idx]

def _fit(self, X, handle_unknown='error'):
def _fit(self, X):
X_list, n_samples, n_features = self._check_X(X)

if self.categories != 'auto':
Expand All @@ -81,25 +80,38 @@ def _fit(self, X, handle_unknown='error'):

for i in range(n_features):
Xi = X_list[i]
# check the presence of NaNs during fit
nan_mask = _object_dtype_isnan(Xi)

if self.categories == 'auto':
cats = _encode(Xi)
# _encode(np.array(['a', 'b', np.nan], dtype='object'))
# throws TypeError
# add back np.nan later if handle_missing = 'indicator'
cats = _encode(Xi[~nan_mask])
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
if Xi.dtype != object:
if not np.all(np.sort(cats) == cats):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if handle_unknown == 'error':
diff = _encode_check_unknown(Xi, cats)
if self.handle_unknown == 'error':
diff = _encode_check_unknown(Xi[~nan_mask], cats)
if diff:
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)

if self.handle_missing == 'indicator' and np.any(nan_mask):
cats = np.append(cats, np.nan)

self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
def _transform(self, X):
X_list, n_samples, n_features = self._check_X(X)

# from now on, either X is without NaNs,
# or X has NaNs but handle_missing is not None.
# since we'll handle NaNs separately so that they do not interfere
# with handle_unknown, we won't count NaNs as unknown categories
X_int = np.zeros((n_samples, n_features), dtype=np.int)
X_mask = np.ones((n_samples, n_features), dtype=np.bool)

Expand All @@ -115,17 +127,16 @@ def _transform(self, X, handle_unknown='error'):
Xi = X_list[i]
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)
# NaNs don't count as unknown categories
nan_valid_mask = valid_mask | _object_dtype_isnan(Xi)

if not np.all(valid_mask):
if handle_unknown == 'error':
if (not np.all(nan_valid_mask)
and self.handle_unknown == 'error'):
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
else:
# Set the problematic rows to an acceptable value and
# continue `The rows are marked `X_mask` and will be
# removed later.
X_mask[:, i] = valid_mask
# cast Xi into the largest string type necessary
# to handle different lengths of numpy strings
if (self.categories_[i].dtype.kind in ('U', 'S')
Expand All @@ -134,7 +145,15 @@ def _transform(self, X, handle_unknown='error'):
else:
Xi = Xi.copy()

if self.handle_missing == 'indicator':
valid_mask = nan_valid_mask
# handle_missing='ignore' and handle_unknown='ignore'
# are essentially the same
Xi[~valid_mask] = self.categories_[i][0]
# Set the problematic rows to an acceptable value and
# continue. The rows are marked in `X_mask` and will be
# removed later.
X_mask[:, i] = valid_mask
# We use check_unknown=False, since _encode_check_unknown was
# already called above.
_, encoded = _encode(Xi, self.categories_[i], encode=True,
Expand Down Expand Up @@ -219,6 +238,14 @@ class OneHotEncoder(_BaseEncoder):
will be all zeros. In the inverse transform, an unknown category
will be denoted as None.

handle_missing : {'indicator', 'ignore'}, default=None
Specify how to handle missing categorical features (NaN) in the
training data.

- None : Raise an error in the presence of NaN (the default).
- 'indicator' : Represent with a separate one-hot column.
- 'ignore' : Replace with a row of zeros.

Attributes
----------
categories_ : list of arrays
Expand Down Expand Up @@ -293,12 +320,15 @@ class OneHotEncoder(_BaseEncoder):
"""

def __init__(self, categories='auto', drop=None, sparse=True,
dtype=np.float64, handle_unknown='error'):
dtype=np.float64, handle_unknown='error',
handle_missing=None):
self.categories = categories
self.sparse = sparse
self.dtype = dtype
self.handle_unknown = handle_unknown
self.drop = drop
self.handle_missing = handle_missing
self.force_all_finite = True if handle_missing is None else 'allow-nan'

def _validate_keywords(self):
if self.handle_unknown not in ('error', 'ignore'):
Expand All @@ -321,8 +351,9 @@ def _compute_drop_idx(self):
if self.drop == 'first':
return np.zeros(len(self.categories_), dtype=np.object)
elif self.drop == 'if_binary':
return np.array([0 if len(cats) == 2 else None
for cats in self.categories_], dtype=np.object)
return np.array(
[0 if len(cats) == 2 else None
for cats in self.categories_], dtype=np.object)
else:
msg = (
"Wrong input for parameter `drop`. Expected "
Expand Down Expand Up @@ -378,7 +409,7 @@ def fit(self, X, y=None):
self
"""
self._validate_keywords()
self._fit(X, handle_unknown=self.handle_unknown)
self._fit(X)
self.drop_idx_ = self._compute_drop_idx()
return self

Expand Down Expand Up @@ -421,7 +452,7 @@ def transform(self, X):
"""
check_is_fitted(self)
# validation of X happens in _check_X called by _transform
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
X_int, X_mask = self._transform(X)

n_samples, n_features = X_int.shape

Expand Down Expand Up @@ -528,7 +559,8 @@ def inverse_transform(self, X):
# for sparse X argmax returns 2D matrix, ensure 1D array
labels = np.asarray(sub.argmax(axis=1)).flatten()
X_tr[:, i] = cats[labels]
if self.handle_unknown == 'ignore':
if (self.handle_unknown == 'ignore' or
self.handle_missing == 'ignore'):
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
# ignored unknown categories: we have a row of all zero
if unknown.any():
Expand Down Expand Up @@ -653,9 +685,14 @@ class OrdinalEncoder(_BaseEncoder):
['Female', 2]], dtype=object)
"""

def __init__(self, categories='auto', dtype=np.float64):
def __init__(self, categories='auto', dtype=np.float64,
handle_unknown='error', handle_missing=None):
self.categories = categories
self.dtype = dtype
# TODO: handle unknown and missing for OrdinalEncoder
self.handle_unknown = handle_unknown
self.handle_missing = handle_missing
self.force_all_finite = True if handle_missing is None else 'allow-nan'

def fit(self, X, y=None):
"""
Expand Down
24 changes: 16 additions & 8 deletions sklearn/preprocessing/_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ..utils.validation import _num_samples
from ..utils.multiclass import unique_labels
from ..utils.multiclass import type_of_target
from ..utils.fixes import _object_dtype_isnan


__all__ = [
Expand Down Expand Up @@ -54,18 +55,24 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
return uniques


def _encode_python(values, uniques=None, encode=False):
def _encode_python(values, uniques=None, encode=False, check_unknown=True):
# only used in _encode below, see docstring there for details
if uniques is None:
uniques = sorted(set(values))
uniques = np.array(uniques, dtype=values.dtype)
uniques = np.array(uniques, dtype=values.dtype)
n_uniques = (~_object_dtype_isnan(uniques)).sum()
if encode:
table = {val: i for i, val in enumerate(uniques)}
try:
encoded = np.array([table[v] for v in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
if check_unknown:
try:
encoded = np.array([table[v] for v in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
else:
encoded = np.array(
[table[v] if v in table else n_uniques for v in values])

return uniques, encoded
else:
return uniques
Expand Down Expand Up @@ -109,7 +116,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
"""
if values.dtype == object:
try:
res = _encode_python(values, uniques, encode)
res = _encode_python(values, uniques, encode,
check_unknown=check_unknown)
except TypeError:
types = sorted(t.__qualname__
for t in set(type(v) for v in values))
Expand Down
65 changes: 61 additions & 4 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,63 @@ def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
ohe.transform(X)


@pytest.mark.parametrize("X", [
np.array([[1, 2, np.nan, 2]]).T,
np.array([['a', 'b', np.nan, 'b']], dtype=object).T],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
@pytest.mark.parametrize("handle_missing, expected", [
('indicator', np.array(
[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [0., 1., 0.]])),
('ignore', np.array([[1., 0.], [0., 1.], [0., 0.], [0., 1.]]))])
def test_one_hot_encoder_handle_missing(
X, as_data_frame, handle_unknown, handle_missing, expected):
if as_data_frame:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)

enc = OneHotEncoder(
categories='auto', sparse=False,
handle_unknown=handle_unknown, handle_missing=handle_missing)
assert_array_equal(enc.fit_transform(X), expected)

exp_inv = enc.inverse_transform(expected)
# replace np.nan with None to compare
# if being more precise, handle_missing = 'ignore' will return None
# while handle_missing = 'indicator' will return NaN
exp_inv = np.array(exp_inv, dtype=object)
exp_inv[2, 0] = None
X_inv = np.array(X, dtype=object)
X_inv[2, 0] = None
assert_array_equal(exp_inv, X_inv)


@pytest.mark.parametrize("X_tr, X_ts", [
(np.array([[1, 2, 2]]).T, np.array([[1, np.nan]]).T),
(np.array([['a', 'b', 'b']], dtype=object).T,
np.array([['a', np.nan]], dtype=object).T)],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
@pytest.mark.parametrize("handle_missing", ['indicator', 'ignore'])
def test_one_hot_encoder_handle_missing_transform(
X_tr, X_ts, as_data_frame, handle_unknown, handle_missing):
if as_data_frame:
pd = pytest.importorskip('pandas')
X_tr = pd.DataFrame(X_tr)

enc = OneHotEncoder(
categories='auto', sparse=False,
handle_unknown=handle_unknown, handle_missing=handle_missing).fit(X_tr)

exp = np.array([[1., 0.],
[0., 0.]])
assert_array_equal(enc.transform(X_ts), exp)


@pytest.mark.parametrize("X", [
[['abc', 2, 55], ['def', 1, 55]],
np.array([[10, 2, 55], [20, 1, 55]]),
Expand Down Expand Up @@ -637,15 +694,15 @@ def test_one_hot_encoder_drop_manual():
@pytest.mark.parametrize(
"X_fit, params, err_msg",
[([["Male"], ["Female"]], {'drop': 'second'},
"Wrong input for parameter `drop`"),
"Wrong input for parameter `drop`"),
([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
"`handle_unknown` must be 'error'"),
"`handle_unknown` must be 'error'"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': np.asarray('b', dtype=object)},
"Wrong input for parameter `drop`"),
"Wrong input for parameter `drop`"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': ['ghi', 3, 59]},
"The following categories were supposed")]
"The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
enc = OneHotEncoder(**params)
Expand Down
4 changes: 3 additions & 1 deletion sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,4 +653,6 @@ def test_encode_check_unknown():
values = np.array(['a', 'b', 'c', 'd'], dtype=object)
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=False)
_encode(values, uniques, encode=True, check_unknown=True)

_encode(values, uniques, encode=True, check_unknown=False)