diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4c68f9e635498..b43a4d6871b77 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -481,8 +481,9 @@ new feature of integers (0 to n_categories - 1):: >>> enc = preprocessing.OrdinalEncoder() >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] - >>> enc.fit(X) # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + >>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='error', unknown_category=None, unknown_value=-1) >>> enc.transform([['female', 'from US', 'uses Safari']]) array([[0., 1., 1.]]) @@ -561,8 +562,8 @@ parameter allows the user to specify a category for each feature to be dropped. This is useful to avoid co-linearity in the input matrix in some classifiers. Such functionality is useful, for example, when using non-regularized regression (:class:`LinearRegression <sklearn.linear_model.LinearRegression>`), -since co-linearity would cause the covariance matrix to be non-invertible. -When this paramenter is not None, ``handle_unknown`` must be set to +since co-linearity would cause the covariance matrix to be non-invertible. +When this parameter is not None, ``handle_unknown`` must be set to ``error``:: >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 6d11e7907984a..215b00bc1bfe9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -877,6 +877,24 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. + handle_unknown : 'error' or 'ignore', default='error' + Whether to raise an error or ignore if an unknown categorical feature + is present during transform (default is to raise). 
When this parameter + is set to 'ignore' and an unknown category is encountered during + transform, it is encoded as ``unknown_value``. In the inverse + transform, that value is mapped back to ``unknown_category``. + + unknown_category : object, default=None + Value substituted for unknown categories in + :meth:`~OrdinalEncoder.inverse_transform`. + + unknown_value : int, default=-1 + Code assigned to unknown categories in + :meth:`~OrdinalEncoder.transform`. Note that if this value + is in :math:`[0, N-1]`, where N is the number of categories, + it will be indistinguishable from the code of a known category. + + Attributes ---------- categories_ : list of arrays @@ -894,7 +912,8 @@ class OrdinalEncoder(_BaseEncoder): >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) ... # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='error', ...) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) @@ -905,6 +924,23 @@ class OrdinalEncoder(_BaseEncoder): array([['Male', 1], ['Female', 2]], dtype=object) + The ordinal encoding can be set to handle unknown categories at + transform time. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + ... # doctest: +ELLIPSIS + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='ignore', ...) + >>> enc.transform([['Female', 4], ['Other', 1]]) + array([[ 0., -1.], + [-1., 0.]]) + >>> enc.inverse_transform([[1, -1], [0, 1]]) + array([['Male', None], + ['Female', 2]], dtype=object) + See also -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of @@ -913,9 +949,14 @@ class OrdinalEncoder(_BaseEncoder): between 0 and n_classes-1. 
""" - def __init__(self, categories='auto', dtype=np.float64): + def __init__(self, categories='auto', dtype=np.float64, + handle_unknown='error', unknown_category=None, + unknown_value=-1): self.categories = categories self.dtype = dtype + self.handle_unknown = handle_unknown + self.unknown_category = unknown_category + self.unknown_value = unknown_value def fit(self, X, y=None): """Fit the OrdinalEncoder to X. @@ -933,7 +974,7 @@ def fit(self, X, y=None): # base classes uses _categories to deal with deprecations in # OneHoteEncoder: can be removed once deprecations are removed self._categories = self.categories - self._fit(X) + self._fit(X, handle_unknown=self.handle_unknown) return self @@ -951,7 +992,8 @@ def transform(self, X): Transformed input. """ - X_int, _ = self._transform(X) + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + X_int[np.logical_not(X_mask)] = self.unknown_value return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): @@ -988,6 +1030,10 @@ def inverse_transform(self, X): labels = X[:, i].astype('int64', copy=False) X_tr[:, i] = self.categories_[i][labels] + if self.handle_unknown == 'ignore': + unknown = labels == self.unknown_value + X_tr[unknown, i] = self.unknown_category + return X_tr def _more_tags(self): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index db201dcd58c15..eb4ea2ab5eee7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -634,6 +634,22 @@ def test_ordinal_encoder(X): assert_array_equal(enc.fit_transform(X), exp) +@pytest.mark.parametrize("Xfit,Xtrans", [ + ([['abc', 2, 55], ['def', 1, 55]], [['ghi', 2, 55]]), + (np.array([[10, 2, 55], [20, 1, 55]]), np.array([[30, 2, 55]])), + (np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object), + np.array([['C', 'B', 'cat']], dtype=object)) + ], ids=['mixed', 'numeric', 'object']) +def 
test_ordinal_encoder_handle_unknown(Xfit, Xtrans): + enc = OrdinalEncoder(handle_unknown='ignore') + exp = np.array([[-1, 1, 0]], dtype='int64') + enc.fit_transform(Xfit) + assert_array_equal(enc.transform(Xtrans), exp.astype('float64')) + enc = OrdinalEncoder(dtype='int64', handle_unknown='ignore') + enc.fit(Xfit) + assert_array_equal(enc.transform(Xtrans), exp) + + @pytest.mark.parametrize("X, X2, cats, cat_dtype", [ (np.array([['a', 'b']], dtype=object).T, np.array([['a', 'd']], dtype=object).T, @@ -662,6 +678,26 @@ def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype): enc.fit(X2) +@pytest.mark.parametrize("X, cats, cat_dtype", [ + (np.array([['d', 'b']], dtype=object).T, + [['a', 'b', 'c']], np.object_), + (np.array([[4, 2]], dtype='int64').T, + [[1, 2, 3]], np.int64), + (np.array([['d', 'b']], dtype=object).T, + [np.array(['a', 'b', 'c'])], np.object_), + ], ids=['object', 'numeric', 'object-string-cat']) +def test_ordinal_encoder_specified_categories_handle_unknown(X, cats, + cat_dtype): + enc = OrdinalEncoder(categories=cats, handle_unknown='ignore') + exp = np.array([[-1.], [1.]]) + assert_array_equal(enc.fit_transform(X), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + def test_ordinal_encoder_inverse(): X = [['abc', 2, 55], ['def', 1, 55]] enc = OrdinalEncoder() @@ -675,6 +711,20 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) +def test_ordinal_encoder_inverse_handle_unknown(): + X = [['abc', 2, 55], ['def', 1, 55]] + Xu = [['ghi', 2, 55]] + enc = OrdinalEncoder(handle_unknown='ignore') + X_tr = enc.fit(X).transform(Xu) + exp = np.array([[None, 2, 55]], dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 
2], [1, 0, 1, 0]]) + msg = re.escape('Shape of the passed X data is not correct') + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) + + @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], ids=['numeric', 'object']) @@ -703,6 +753,7 @@ def test_ordinal_encoder_raise_categories_shape(): with pytest.raises(ValueError, match=msg): enc.fit(X) + def test_encoder_dtypes(): # check that dtypes are preserved when determining categories enc = OneHotEncoder(categories='auto')