Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[MRG] Adds handle unknown option to ordinal encoder #13897

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions doc/modules/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -481,8 +481,9 @@ new feature of integers (0 to n_categories - 1)::

>>> enc = preprocessing.OrdinalEncoder()
>>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
>>> enc.fit(X) # doctest: +ELLIPSIS
OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
>>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>,
handle_unknown='error', unknown_category=None, unknown_value=-1)
>>> enc.transform([['female', 'from US', 'uses Safari']])
array([[0., 1., 1.]])

Expand Down Expand Up @@ -561,8 +562,8 @@ parameter allows the user to specify a category for each feature to be dropped.
This is useful to avoid co-linearity in the input matrix in some classifiers.
Such functionality is useful, for example, when using non-regularized
regression (:class:`LinearRegression <sklearn.linear_model.LinearRegression>`),
since co-linearity would cause the covariance matrix to be non-invertible.
When this parameter is not None, ``handle_unknown`` must be set to
since co-linearity would cause the covariance matrix to be non-invertible.
When this parameter is not None, ``handle_unknown`` must be set to
``error``::

>>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
Expand Down
54 changes: 50 additions & 4 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,24 @@ class OrdinalEncoder(_BaseEncoder):
dtype : number type, default np.float64
Desired dtype of output.

handle_unknown : 'error' or 'ignore', default='error'
Whether to raise an error or ignore if an unknown categorical feature
is present during transform (default is to raise). When this parameter
is set to 'ignore' and an unknown category is encountered during
transform, it is encoded as ``unknown_value`` (``-1`` by default). In
the inverse transform, an unknown category will be denoted as
``unknown_category`` (None by default).

unknown_category : object, default=None
Value used to represent unknown categories in
:meth:`~OrdinalEncoder.inverse_transform`. Only used when
``handle_unknown='ignore'``.

unknown_value : integer, default=-1
Value used to encode unknown categories in
:meth:`~OrdinalEncoder.transform`. Note that if this value
lies in :math:`[0, N-1]`, where N is the number of categories,
it will be indistinguishable from the code of a known category.


Attributes
----------
categories_ : list of arrays
Expand All @@ -894,7 +912,8 @@ class OrdinalEncoder(_BaseEncoder):
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
... # doctest: +ELLIPSIS
OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
handle_unknown='error', ...)
>>> enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> enc.transform([['Female', 3], ['Male', 1]])
Expand All @@ -905,6 +924,23 @@ class OrdinalEncoder(_BaseEncoder):
array([['Male', 1],
['Female', 2]], dtype=object)

The ordinal encoding can be set to handle unknown categories at
transform time.

>>> from sklearn.preprocessing import OrdinalEncoder
>>> enc = OrdinalEncoder(handle_unknown='ignore')
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
... # doctest: +ELLIPSIS
OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
handle_unknown='ignore', ...)
>>> enc.transform([['Female', 4], ['Other', 1]])
array([[ 0., -1.],
[-1., 0.]])
>>> enc.inverse_transform([[1, -1], [0, 1]])
array([['Male', None],
['Female', 2]], dtype=object)

See also
--------
sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
Expand All @@ -913,9 +949,14 @@ class OrdinalEncoder(_BaseEncoder):
between 0 and n_classes-1.
"""

def __init__(self, categories='auto', dtype=np.float64):
def __init__(self, categories='auto', dtype=np.float64,
handle_unknown='error', unknown_category=None,
unknown_value=-1):
self.categories = categories
self.dtype = dtype
self.handle_unknown = handle_unknown
self.unknown_category = unknown_category
self.unknown_value = unknown_value

def fit(self, X, y=None):
"""Fit the OrdinalEncoder to X.
Expand All @@ -933,7 +974,7 @@ def fit(self, X, y=None):
# base class uses _categories to deal with deprecations in
# OneHotEncoder: can be removed once deprecations are removed
self._categories = self.categories
self._fit(X)
self._fit(X, handle_unknown=self.handle_unknown)

return self

Expand All @@ -951,7 +992,8 @@ def transform(self, X):
Transformed input.

"""
X_int, _ = self._transform(X)
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
X_int[np.logical_not(X_mask)] = self.unknown_value
return X_int.astype(self.dtype, copy=False)

def inverse_transform(self, X):
Expand Down Expand Up @@ -988,6 +1030,10 @@ def inverse_transform(self, X):
labels = X[:, i].astype('int64', copy=False)
X_tr[:, i] = self.categories_[i][labels]

if self.handle_unknown == 'ignore':
unknown = labels == self.unknown_value
X_tr[unknown, i] = self.unknown_category

return X_tr

def _more_tags(self):
Expand Down
51 changes: 51 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,22 @@ def test_ordinal_encoder(X):
assert_array_equal(enc.fit_transform(X), exp)


@pytest.mark.parametrize("Xfit,Xtrans", [
    pytest.param(
        [['abc', 2, 55], ['def', 1, 55]],
        [['ghi', 2, 55]],
        id='mixed'),
    pytest.param(
        np.array([[10, 2, 55], [20, 1, 55]]),
        np.array([[30, 2, 55]]),
        id='numeric'),
    pytest.param(
        np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object),
        np.array([['C', 'B', 'cat']], dtype=object),
        id='object'),
])
def test_ordinal_encoder_handle_unknown(Xfit, Xtrans):
    """Check the codes produced for unseen categories with 'ignore'.

    The first column of ``Xtrans`` holds a value absent from ``Xfit``; the
    test expects it to be encoded as -1, for both the default float output
    and an explicit int64 output dtype.
    """
    expected_codes = np.array([[-1, 1, 0]], dtype='int64')

    # Default dtype: the encoder is fitted through fit_transform and the
    # result is compared against the float64 version of the expected codes.
    float_encoder = OrdinalEncoder(handle_unknown='ignore')
    float_encoder.fit_transform(Xfit)
    float_codes = float_encoder.transform(Xtrans)
    assert_array_equal(float_codes, expected_codes.astype('float64'))

    # Integer dtype: fitted through fit, compared against the int64 codes.
    int_encoder = OrdinalEncoder(dtype='int64', handle_unknown='ignore')
    int_encoder.fit(Xfit)
    int_codes = int_encoder.transform(Xtrans)
    assert_array_equal(int_codes, expected_codes)


@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
Expand Down Expand Up @@ -662,6 +678,26 @@ def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
enc.fit(X2)


@pytest.mark.parametrize("X, cats, cat_dtype", [
    pytest.param(
        np.array([['d', 'b']], dtype=object).T,
        [['a', 'b', 'c']],
        np.object_,
        id='object'),
    pytest.param(
        np.array([[4, 2]], dtype='int64').T,
        [[1, 2, 3]],
        np.int64,
        id='numeric'),
    pytest.param(
        np.array([['d', 'b']], dtype=object).T,
        [np.array(['a', 'b', 'c'])],
        np.object_,
        id='object-string-cat'),
])
def test_ordinal_encoder_specified_categories_handle_unknown(X, cats,
                                                             cat_dtype):
    """Check 'ignore' handling when the categories are given explicitly.

    The first sample of ``X`` is not among ``cats`` and is expected to be
    encoded as -1; the second sample is expected to map to code 1.
    """
    given_categories = list(cats[0])
    expected_codes = np.array([[-1.], [1.]])

    encoder = OrdinalEncoder(categories=cats, handle_unknown='ignore')
    codes = encoder.fit_transform(X)
    assert_array_equal(codes, expected_codes)

    # The constructor argument and the fitted attribute both hold the
    # categories that were passed in.
    assert list(encoder.categories[0]) == given_categories
    fitted_categories = encoder.categories_[0]
    assert fitted_categories.tolist() == given_categories

    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert fitted_categories.dtype == cat_dtype


def test_ordinal_encoder_inverse():
X = [['abc', 2, 55], ['def', 1, 55]]
enc = OrdinalEncoder()
Expand All @@ -675,6 +711,20 @@ def test_ordinal_encoder_inverse():
assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)


def test_ordinal_encoder_inverse_handle_unknown():
    """Check inverse_transform on data that contained an unseen category.

    An encoder fitted with ``handle_unknown='ignore'`` transforms a row whose
    first value was not seen during fit; inverting the result is expected to
    yield None in that position. An input with the wrong number of columns
    is expected to raise a ValueError.
    """
    train_rows = [['abc', 2, 55], ['def', 1, 55]]
    unseen_rows = [['ghi', 2, 55]]

    encoder = OrdinalEncoder(handle_unknown='ignore')
    encoder.fit(train_rows)
    encoded = encoder.transform(unseen_rows)

    expected = np.array([[None, 2, 55]], dtype=object)
    assert_array_equal(encoder.inverse_transform(encoded), expected)

    # incorrect shape raises
    wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    pattern = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=pattern):
        encoder.inverse_transform(wrong_shape)


@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
Expand Down Expand Up @@ -703,6 +753,7 @@ def test_ordinal_encoder_raise_categories_shape():
with pytest.raises(ValueError, match=msg):
enc.fit(X)


def test_encoder_dtypes():
# check that dtypes are preserved when determining categories
enc = OneHotEncoder(categories='auto')
Expand Down