Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[WIP] Handle NaNs in OneHotEncoder #16749

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/modules/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,18 @@ columns for this feature will be all zeros
array([[1., 0., 0., 0., 0., 0.]])


You can also specify the parameter ``handle_missing`` to deal with ``NaN`` values
during either fitting or transforming. It is ``None`` by default, which will raise a
``ValueError``. When ``handle_missing='indicator'``, ``NaN`` will be represented by a
separate one-hot column. Whereas if ``handle_missing='ignore'``, the resulting one-hot
encoded columns for this feature will be all zeros (similar to ``handle_unknown='ignore'``)

Note that, for scikit-learn to handle your missing values using OneHotEncoder,
you have to pass a placeholder of what should be recorded as a missing value.
This is the `missing_values` parameter and possible values can be either a
`NaN` or a custom value of your choice.


It is also possible to encode each column into ``n_categories - 1`` columns
instead of ``n_categories`` columns by using the ``drop`` parameter. This
parameter allows the user to specify a category for each feature to be dropped.
Expand Down
93 changes: 65 additions & 28 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.fixes import _object_dtype_isnan
from ..utils.validation import check_is_fitted

from ._label import _encode, _encode_check_unknown


__all__ = [
'OneHotEncoder',
'OrdinalEncoder'
Expand Down Expand Up @@ -39,25 +38,25 @@ def _check_X(self, X):
"""
if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
X_temp = check_array(
X, dtype=None, force_all_finite=self.force_all_finite)
if (not hasattr(X, 'dtype')
and np.issubdtype(X_temp.dtype, np.str_)):
X = check_array(X, dtype=np.object)
X = check_array(X, dtype=np.object,
force_all_finite=self.force_all_finite)
else:
X = X_temp
needs_validation = False
else:
# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.
needs_validation = True

# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.

n_samples, n_features = X.shape
X_columns = []

for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
force_all_finite=self.force_all_finite)
X_columns.append(Xi)

return X_columns, n_samples, n_features
Expand All @@ -69,7 +68,7 @@ def _get_feature(self, X, feature_idx):
# numpy arrays, sparse arrays
return X[:, feature_idx]

def _fit(self, X, handle_unknown='error'):
def _fit(self, X):
X_list, n_samples, n_features = self._check_X(X)

if self.categories != 'auto':
Expand All @@ -81,25 +80,38 @@ def _fit(self, X, handle_unknown='error'):

for i in range(n_features):
Xi = X_list[i]
# check the presence of NaNs during fit
nan_mask = _object_dtype_isnan(Xi)

if self.categories == 'auto':
cats = _encode(Xi)
# _encode(np.array(['a', 'b', np.nan], dtype='object'))
# throws TypeError
# add back np.nan later if handle_missing = 'indicator'
cats = _encode(Xi[~nan_mask])
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
if Xi.dtype != object:
if not np.all(np.sort(cats) == cats):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if handle_unknown == 'error':
diff = _encode_check_unknown(Xi, cats)
if self.handle_unknown == 'error':
diff = _encode_check_unknown(Xi[~nan_mask], cats)
if diff:
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)

if self.handle_missing == 'indicator' and np.any(nan_mask):
cats = np.append(cats, np.nan)

self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
def _transform(self, X):
X_list, n_samples, n_features = self._check_X(X)

# from now on, either X is without NaNs,
# or X has NaNs but handle_missing is not None.
# since we'll handle NaNs separately so that they do not interfere
# with handle_unknown, we won't count NaNs as unknown categories
X_int = np.zeros((n_samples, n_features), dtype=np.int)
X_mask = np.ones((n_samples, n_features), dtype=np.bool)

Expand All @@ -115,17 +127,16 @@ def _transform(self, X, handle_unknown='error'):
Xi = X_list[i]
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)
# NaNs don't count as unknown categories
nan_valid_mask = valid_mask | _object_dtype_isnan(Xi)

if not np.all(valid_mask):
if handle_unknown == 'error':
if (not np.all(nan_valid_mask)
and self.handle_unknown == 'error'):
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
else:
# Set the problematic rows to an acceptable value and
# continue `The rows are marked `X_mask` and will be
# removed later.
X_mask[:, i] = valid_mask
# cast Xi into the largest string type necessary
# to handle different lengths of numpy strings
if (self.categories_[i].dtype.kind in ('U', 'S')
Expand All @@ -134,7 +145,15 @@ def _transform(self, X, handle_unknown='error'):
else:
Xi = Xi.copy()

if self.handle_missing == 'indicator':
valid_mask = nan_valid_mask
# handle_missing='ignore' and handle_unknown='ignore'
# are essentially the same
Xi[~valid_mask] = self.categories_[i][0]
# Set the problematic rows to an acceptable value and
# continue. The rows are marked in `X_mask` and will be
# removed later.
X_mask[:, i] = valid_mask
# We use check_unknown=False, since _encode_check_unknown was
# already called above.
_, encoded = _encode(Xi, self.categories_[i], encode=True,
Expand Down Expand Up @@ -219,6 +238,14 @@ class OneHotEncoder(_BaseEncoder):
will be all zeros. In the inverse transform, an unknown category
will be denoted as None.

handle_missing : {'indicator', 'ignore'}, default=None
Specify how to handle missing categorical features (NaN) in the
training data.

- None : Raise an error in the presence of NaN (the default).
- 'indicator' : Represent with a separate one-hot column.
- 'ignore' : Replace with a row of zeros.

Attributes
----------
categories_ : list of arrays
Expand Down Expand Up @@ -293,12 +320,15 @@ class OneHotEncoder(_BaseEncoder):
"""

def __init__(self, categories='auto', drop=None, sparse=True,
dtype=np.float64, handle_unknown='error'):
dtype=np.float64, handle_unknown='error',
handle_missing=None):
self.categories = categories
self.sparse = sparse
self.dtype = dtype
self.handle_unknown = handle_unknown
self.drop = drop
self.handle_missing = handle_missing
self.force_all_finite = True if handle_missing is None else 'allow-nan'

def _validate_keywords(self):
if self.handle_unknown not in ('error', 'ignore'):
Expand All @@ -321,8 +351,9 @@ def _compute_drop_idx(self):
if self.drop == 'first':
return np.zeros(len(self.categories_), dtype=np.object)
elif self.drop == 'if_binary':
return np.array([0 if len(cats) == 2 else None
for cats in self.categories_], dtype=np.object)
return np.array(
[0 if len(cats) == 2 else None
for cats in self.categories_], dtype=np.object)
else:
msg = (
"Wrong input for parameter `drop`. Expected "
Expand Down Expand Up @@ -378,7 +409,7 @@ def fit(self, X, y=None):
self
"""
self._validate_keywords()
self._fit(X, handle_unknown=self.handle_unknown)
self._fit(X)
self.drop_idx_ = self._compute_drop_idx()
return self

Expand Down Expand Up @@ -421,7 +452,7 @@ def transform(self, X):
"""
check_is_fitted(self)
# validation of X happens in _check_X called by _transform
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
X_int, X_mask = self._transform(X)

n_samples, n_features = X_int.shape

Expand Down Expand Up @@ -528,7 +559,8 @@ def inverse_transform(self, X):
# for sparse X argmax returns 2D matrix, ensure 1D array
labels = np.asarray(sub.argmax(axis=1)).flatten()
X_tr[:, i] = cats[labels]
if self.handle_unknown == 'ignore':
if (self.handle_unknown == 'ignore' or
self.handle_missing == 'ignore'):
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
# ignored unknown categories: we have a row of all zero
if unknown.any():
Expand Down Expand Up @@ -653,9 +685,14 @@ class OrdinalEncoder(_BaseEncoder):
['Female', 2]], dtype=object)
"""

def __init__(self, categories='auto', dtype=np.float64):
def __init__(self, categories='auto', dtype=np.float64,
handle_unknown='error', handle_missing=None):
self.categories = categories
self.dtype = dtype
# TODO: handle unknown and missing for OrdinalEncoder
self.handle_unknown = handle_unknown
self.handle_missing = handle_missing
self.force_all_finite = True if handle_missing is None else 'allow-nan'

def fit(self, X, y=None):
"""
Expand Down
24 changes: 16 additions & 8 deletions sklearn/preprocessing/_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ..utils.validation import _num_samples
from ..utils.multiclass import unique_labels
from ..utils.multiclass import type_of_target
from ..utils.fixes import _object_dtype_isnan


__all__ = [
Expand Down Expand Up @@ -54,18 +55,24 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
return uniques


def _encode_python(values, uniques=None, encode=False):
def _encode_python(values, uniques=None, encode=False, check_unknown=True):
# only used in _encode below, see docstring there for details
if uniques is None:
uniques = sorted(set(values))
uniques = np.array(uniques, dtype=values.dtype)
uniques = np.array(uniques, dtype=values.dtype)
n_uniques = (~_object_dtype_isnan(uniques)).sum()
if encode:
table = {val: i for i, val in enumerate(uniques)}
try:
encoded = np.array([table[v] for v in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
if check_unknown:
try:
encoded = np.array([table[v] for v in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
else:
encoded = np.array(
[table[v] if v in table else n_uniques for v in values])

return uniques, encoded
else:
return uniques
Expand Down Expand Up @@ -109,7 +116,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
"""
if values.dtype == object:
try:
res = _encode_python(values, uniques, encode)
res = _encode_python(values, uniques, encode,
check_unknown=check_unknown)
except TypeError:
types = sorted(t.__qualname__
for t in set(type(v) for v in values))
Expand Down
65 changes: 61 additions & 4 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,63 @@ def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
ohe.transform(X)


@pytest.mark.parametrize("X", [
np.array([[1, 2, np.nan, 2]]).T,
np.array([['a', 'b', np.nan, 'b']], dtype=object).T],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
@pytest.mark.parametrize("handle_missing, expected", [
('indicator', np.array(
[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [0., 1., 0.]])),
('ignore', np.array([[1., 0.], [0., 1.], [0., 0.], [0., 1.]]))])
def test_one_hot_encoder_handle_missing(
X, as_data_frame, handle_unknown, handle_missing, expected):
if as_data_frame:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)

enc = OneHotEncoder(
categories='auto', sparse=False,
handle_unknown=handle_unknown, handle_missing=handle_missing)
assert_array_equal(enc.fit_transform(X), expected)

exp_inv = enc.inverse_transform(expected)
# replace np.nan with None to compare
# if being more precise, handle_missing = 'ignore' will return None
# while handle_missing = 'indicator' will return NaN
exp_inv = np.array(exp_inv, dtype=object)
exp_inv[2, 0] = None
X_inv = np.array(X, dtype=object)
X_inv[2, 0] = None
assert_array_equal(exp_inv, X_inv)


@pytest.mark.parametrize("X_tr, X_ts", [
(np.array([[1, 2, 2]]).T, np.array([[1, np.nan]]).T),
(np.array([['a', 'b', 'b']], dtype=object).T,
np.array([['a', np.nan]], dtype=object).T)],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
@pytest.mark.parametrize("handle_missing", ['indicator', 'ignore'])
def test_one_hot_encoder_handle_missing_transform(
X_tr, X_ts, as_data_frame, handle_unknown, handle_missing):
if as_data_frame:
pd = pytest.importorskip('pandas')
X_tr = pd.DataFrame(X_tr)

enc = OneHotEncoder(
categories='auto', sparse=False,
handle_unknown=handle_unknown, handle_missing=handle_missing).fit(X_tr)

exp = np.array([[1., 0.],
[0., 0.]])
assert_array_equal(enc.transform(X_ts), exp)


@pytest.mark.parametrize("X", [
[['abc', 2, 55], ['def', 1, 55]],
np.array([[10, 2, 55], [20, 1, 55]]),
Expand Down Expand Up @@ -637,15 +694,15 @@ def test_one_hot_encoder_drop_manual():
@pytest.mark.parametrize(
"X_fit, params, err_msg",
[([["Male"], ["Female"]], {'drop': 'second'},
"Wrong input for parameter `drop`"),
"Wrong input for parameter `drop`"),
([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
"`handle_unknown` must be 'error'"),
"`handle_unknown` must be 'error'"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': np.asarray('b', dtype=object)},
"Wrong input for parameter `drop`"),
"Wrong input for parameter `drop`"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': ['ghi', 3, 59]},
"The following categories were supposed")]
"The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
enc = OneHotEncoder(**params)
Expand Down
4 changes: 3 additions & 1 deletion sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,4 +653,6 @@ def test_encode_check_unknown():
values = np.array(['a', 'b', 'c', 'd'], dtype=object)
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=False)
_encode(values, uniques, encode=True, check_unknown=True)

_encode(values, uniques, encode=True, check_unknown=False)