scikit-learn · fmder · May 16, 2019 · May 16, 2019 · May 16, 2019 · May 17, 2019
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -481,8 +481,9 @@ new feature of integers (0 to n_categories - 1)::
 
     >>> enc = preprocessing.OrdinalEncoder()
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
-    >>> enc.fit(X)  # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>,
+                   handle_unknown='error', unknown_category=None, unknown_value=-1)
     >>> enc.transform([['female', 'from US', 'uses Safari']])
     array([[0., 1., 1.]])
 
@@ -561,8 +562,8 @@ parameter allows the user to specify a category for each feature to be dropped.
 This is useful to avoid co-linearity in the input matrix in some classifiers.
 Such functionality is useful, for example, when using non-regularized
 regression (:class:`LinearRegression <sklearn.linear_model.LinearRegression>`),
-since co-linearity would cause the covariance matrix to be non-invertible. 
-When this paramenter is not None, ``handle_unknown`` must be set to 
+since co-linearity would cause the covariance matrix to be non-invertible.
+When this parameter is not None, ``handle_unknown`` must be set to
 ``error``::
 
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -877,6 +877,24 @@ class OrdinalEncoder(_BaseEncoder):
     dtype : number type, default np.float64
         Desired dtype of output.
 
+    handle_unknown : 'error' or 'ignore', default='error'
+        Whether to raise an error or ignore if an unknown categorical feature
+        is present during transform (default is to raise). When this parameter
+        is set to 'ignore' and an unknown category is encountered during
+        transform, it is encoded as ``unknown_value`` (``-1`` by default). In
+        the inverse transform it becomes ``unknown_category`` (default None).
+
+    unknown_category : object, default=None
+        Value that unknown categories are mapped back to by
+        :meth:`~OrdinalEncoder.inverse_transform`.
+
+    unknown_value : int, default=-1
+        Value that unknown categories are encoded as by
+        :meth:`~OrdinalEncoder.transform`. Warning: if this value
+        is in :math:`[0, N-1]`, where N is the number of categories,
+        it will be indistinguishable from a known category.
+
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -894,7 +912,8 @@ class OrdinalEncoder(_BaseEncoder):
     >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
     >>> enc.fit(X)
     ... # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   handle_unknown='error', ...)
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
     >>> enc.transform([['Female', 3], ['Male', 1]])
@@ -905,6 +924,23 @@ class OrdinalEncoder(_BaseEncoder):
     array([['Male', 1],
            ['Female', 2]], dtype=object)
 
+    The ordinal encoding can be set to handle unknown categories at
+    transform time.
+
+    >>> from sklearn.preprocessing import OrdinalEncoder
+    >>> enc = OrdinalEncoder(handle_unknown='ignore')
+    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
+    >>> enc.fit(X)
+    ... # doctest: +ELLIPSIS
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   handle_unknown='ignore', ...)
+    >>> enc.transform([['Female', 4], ['Other', 1]])
+    array([[ 0., -1.],
+           [-1.,  0.]])
+    >>> enc.inverse_transform([[1, -1], [0, 1]])
+    array([['Male', None],
+           ['Female', 2]], dtype=object)
+
     See also
     --------
     sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
@@ -913,9 +949,14 @@ class OrdinalEncoder(_BaseEncoder):
       between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64,
+                 handle_unknown='error', unknown_category=None,
+                 unknown_value=-1):
         self.categories = categories
         self.dtype = dtype
+        self.handle_unknown = handle_unknown
+        self.unknown_category = unknown_category
+        self.unknown_value = unknown_value
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
@@ -933,7 +974,7 @@ def fit(self, X, y=None):
         # base classes uses _categories to deal with deprecations in
         # OneHoteEncoder: can be removed once deprecations are removed
         self._categories = self.categories
-        self._fit(X)
+        self._fit(X, handle_unknown=self.handle_unknown)
 
         return self
 
@@ -951,7 +992,8 @@ def transform(self, X):
             Transformed input.
 
         """
-        X_int, _ = self._transform(X)
+        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
+        X_int[np.logical_not(X_mask)] = self.unknown_value
         return X_int.astype(self.dtype, copy=False)
 
     def inverse_transform(self, X):
@@ -988,6 +1030,10 @@ def inverse_transform(self, X):
             labels = X[:, i].astype('int64', copy=False)
             X_tr[:, i] = self.categories_[i][labels]
 
+            if self.handle_unknown == 'ignore':
+                unknown = labels == self.unknown_value
+                X_tr[unknown, i] = self.unknown_category
+
         return X_tr
 
     def _more_tags(self):

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -634,6 +634,22 @@ def test_ordinal_encoder(X):
     assert_array_equal(enc.fit_transform(X), exp)
 
 
+@pytest.mark.parametrize("Xfit,Xtrans", [
+    ([['abc', 2, 55], ['def', 1, 55]], [['ghi', 2, 55]]),
+    (np.array([[10, 2, 55], [20, 1, 55]]), np.array([[30, 2, 55]])),
+    (np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object),
+        np.array([['C', 'B', 'cat']], dtype=object))
+    ], ids=['mixed', 'numeric', 'object'])
+def test_ordinal_encoder_handle_unknown(Xfit, Xtrans):
+    enc = OrdinalEncoder(handle_unknown='ignore')
+    exp = np.array([[-1, 1, 0]], dtype='int64')
+    enc.fit_transform(Xfit)
+    assert_array_equal(enc.transform(Xtrans), exp.astype('float64'))
+    enc = OrdinalEncoder(dtype='int64', handle_unknown='ignore')
+    enc.fit(Xfit)
+    assert_array_equal(enc.transform(Xtrans), exp)
+
+
 @pytest.mark.parametrize("X, X2, cats, cat_dtype", [
     (np.array([['a', 'b']], dtype=object).T,
      np.array([['a', 'd']], dtype=object).T,
@@ -662,6 +678,26 @@ def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
         enc.fit(X2)
 
 
+@pytest.mark.parametrize("X, cats, cat_dtype", [
+    (np.array([['d', 'b']], dtype=object).T,
+     [['a', 'b', 'c']], np.object_),
+    (np.array([[4, 2]], dtype='int64').T,
+     [[1, 2, 3]], np.int64),
+    (np.array([['d', 'b']], dtype=object).T,
+     [np.array(['a', 'b', 'c'])], np.object_),
+    ], ids=['object', 'numeric', 'object-string-cat'])
+def test_ordinal_encoder_specified_categories_handle_unknown(X, cats,
+                                                             cat_dtype):
+    enc = OrdinalEncoder(categories=cats, handle_unknown='ignore')
+    exp = np.array([[-1.], [1.]])
+    assert_array_equal(enc.fit_transform(X), exp)
+    assert list(enc.categories[0]) == list(cats[0])
+    assert enc.categories_[0].tolist() == list(cats[0])
+    # manually specified categories should have same dtype as
+    # the data when coerced from lists
+    assert enc.categories_[0].dtype == cat_dtype
+
+
 def test_ordinal_encoder_inverse():
     X = [['abc', 2, 55], ['def', 1, 55]]
     enc = OrdinalEncoder()
@@ -675,6 +711,20 @@ def test_ordinal_encoder_inverse():
     assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
 
 
+def test_ordinal_encoder_inverse_handle_unknown():
+    X = [['abc', 2, 55], ['def', 1, 55]]
+    Xu = [['ghi', 2, 55]]
+    enc = OrdinalEncoder(handle_unknown='ignore')
+    X_tr = enc.fit(X).transform(Xu)
+    exp = np.array([[None, 2, 55]], dtype=object)
+    assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    # incorrect shape raises
+    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
+    msg = re.escape('Shape of the passed X data is not correct')
+    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+
+
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])
@@ -703,6 +753,7 @@ def test_ordinal_encoder_raise_categories_shape():
     with pytest.raises(ValueError, match=msg):
         enc.fit(X)
 
+
 def test_encoder_dtypes():
     # check that dtypes are preserved when determining categories
     enc = OneHotEncoder(categories='auto')