diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 4c68f9e635498..f3607ee55ebdf 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -481,8 +481,9 @@ new feature of integers (0 to n_categories - 1)::
     >>> enc = preprocessing.OrdinalEncoder()
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
-    >>> enc.fit(X)  # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   missing_values=nan)
     >>> enc.transform([['female', 'from US', 'uses Safari']])
     array([[0., 1., 1.]])
 
@@ -505,8 +506,9 @@ Continuing the example above::
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
     >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='error',
-                  n_values=None, sparse=True)
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='error', missing_values=nan, n_values=None,
+                  sparse=True)
     >>> enc.transform([['female', 'from US', 'uses Safari'],
     ...                ['male', 'from Europe', 'uses Safari']]).toarray()
     array([[1., 0., 0., 1., 0., 1.],
@@ -530,10 +532,10 @@ dataset::
     >>> # feature
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
     >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    OneHotEncoder(categorical_features=None,
-                  categories=[...], drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='error',
-                  n_values=None, sparse=True)
+    OneHotEncoder(categorical_features=None, categories=[...], drop=None,
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='error', missing_values=nan, n_values=None,
+                  sparse=True)
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
     array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])
 
@@ -549,8 +551,9 @@ columns for this feature will be all zeros
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
     >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-                  n_values=None, sparse=True)
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='ignore', missing_values=nan, n_values=None,
+                  sparse=True)
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
     array([[1., 0., 0., 0., 0., 0.]])
 
diff --git a/sklearn/impute.py b/sklearn/impute.py
index ea4e8663d0313..2bcbe78971389 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -61,9 +61,14 @@ def _get_mask(X, value_to_mask):
             # np.isnan does not work on object dtypes.
             return _object_dtype_isnan(X)
     else:
-        # X == value_to_mask with object dytpes does not always perform
-        # element-wise for old versions of numpy
-        return np.equal(X, value_to_mask)
+        if X.dtype.kind in ["S", "U"]:
+            # np.equal does not work for byte string and unicode types.
+            # However the == sign works fine.
+            return X == value_to_mask
+        else:
+            # X == value_to_mask with object dtypes does not always perform
+            # element-wise for old versions of numpy
+            return np.equal(X, value_to_mask)
 
 
 def _most_frequent(array, extra_value, n_repeat):
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 6c3df0f22178e..7c5f4b0ed86b2 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -16,8 +16,7 @@
 from ..utils.validation import check_is_fitted
 
 from .base import _transform_selected
-from .label import _encode, _encode_check_unknown
-
+from .label import _nanencode
 
 __all__ = [
     'OneHotEncoder',
@@ -46,7 +45,7 @@ def _check_X(self, X):
         """
         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
             # if not a dataframe, do normal check_array validation
-            X_temp = check_array(X, dtype=None)
+            X_temp = check_array(X, dtype=None, force_all_finite='allow-nan')
             if (not hasattr(X, 'dtype')
                     and np.issubdtype(X_temp.dtype, np.str_)):
                 X = check_array(X, dtype=np.object)
@@ -56,7 +55,7 @@ def _check_X(self, X):
         else:
             # pandas dataframe, do validation later column by column, in order
             # to keep the dtype information to be used in the encoder.
-            needs_validation = True
+            needs_validation = 'allow-nan'
             n_samples, n_features = X.shape
             X_columns = []
 
@@ -76,7 +75,7 @@ def _get_feature(self, X, feature_idx):
         # numpy arrays, sparse arrays
         return X[:, feature_idx]
 
-    def _fit(self, X, handle_unknown='error'):
+    def _fit(self, X, missing_values, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
 
         if self._categories != 'auto':
@@ -89,7 +88,7 @@ def _fit(self, X, handle_unknown='error'):
         for i in range(n_features):
             Xi = X_list[i]
             if self._categories == 'auto':
-                cats = _encode(Xi)
+                cats = _nanencode(Xi, missing_values=missing_values)
             else:
                 cats = np.array(self._categories[i], dtype=Xi.dtype)
                 if Xi.dtype != object:
@@ -97,47 +96,41 @@ def _fit(self, X, handle_unknown='error'):
                         raise ValueError("Unsorted categories are not "
                                          "supported for numerical categories")
             if handle_unknown == 'error':
-                diff = _encode_check_unknown(Xi, cats)
-                if diff:
+                try:
+                    _nanencode(Xi, cats, encode=True,
+                               missing_values=missing_values)
+                except ValueError as e:
+                    # the unseen categories ride along as the second
+                    # argument of the ValueError raised by _nanencode
+                    diff = e.args[1]
                     msg = ("Found unknown categories {0} in column {1}"
                            " during fit".format(diff, i))
                     raise ValueError(msg)
             self.categories_.append(cats)
 
-    def _transform(self, X, handle_unknown='error'):
+    def _transform(self, X, missing_values, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
 
         X_int = np.zeros((n_samples, n_features), dtype=np.int)
-        X_mask = np.ones((n_samples, n_features), dtype=np.bool)
+        X_missing_mask = np.zeros((n_samples, n_features), dtype=np.bool)
+        X_unknown_mask = np.zeros((n_samples, n_features), dtype=np.bool)
+        encode_unknown = handle_unknown != 'error'
 
         for i in range(n_features):
             Xi = X_list[i]
-            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
-                                                     return_mask=True)
-
-            if not np.all(valid_mask):
-                if handle_unknown == 'error':
-                    msg = ("Found unknown categories {0} in column {1}"
-                           " during transform".format(diff, i))
-                    raise ValueError(msg)
-                else:
-                    # Set the problematic rows to an acceptable value and
-                    # continue `The rows are marked `X_mask` and will be
-                    # removed later.
-                    X_mask[:, i] = valid_mask
-                    # cast Xi into the largest string type necessary
-                    # to handle different lengths of numpy strings
-                    if (self.categories_[i].dtype.kind in ('U', 'S')
-                            and self.categories_[i].itemsize > Xi.itemsize):
-                        Xi = Xi.astype(self.categories_[i].dtype)
-                    else:
-                        Xi = Xi.copy()
+            encode_results = _nanencode(Xi, self.categories_[i],
+                                        missing_values=missing_values,
+                                        encode=True,
+                                        encode_unknown=encode_unknown)
+
+            if len(encode_results) == 4:
+                _, encoded, missing_mask, unknown_mask = encode_results
+                X_unknown_mask[:, i] = unknown_mask
+            else:
+                _, encoded, missing_mask = encode_results
 
-                    Xi[~valid_mask] = self.categories_[i][0]
-            _, encoded = _encode(Xi, self.categories_[i], encode=True)
             X_int[:, i] = encoded
+            X_missing_mask[:, i] = missing_mask
 
-        return X_int, X_mask
+        return X_int, X_missing_mask, X_unknown_mask
 
 
 class OneHotEncoder(_BaseEncoder):
@@ -194,6 +187,16 @@ class OneHotEncoder(_BaseEncoder):
     dtype : number type, default=np.float
         Desired dtype of output.
 
+    missing_values : scalar, default=np.nan
+        Value to be encoded as missing.
+
+    handle_missing : 'all-zero', 'category', 'all-missing', default='all-zero'
+        Defines how the missing value should be handled.
+
+        - 'all-zero' : the missing value is encoded as all zeros
+        - 'category' : another category is appended to flag missing values
+        - 'all-missing' : the missing value is encoded as all nan
+
     handle_unknown : 'error' or 'ignore', default='error'.
         Whether to raise an error or ignore if an unknown categorical feature
         is present during transform (default is to raise). When this parameter
@@ -280,8 +283,9 @@ class OneHotEncoder(_BaseEncoder):
     ... # doctest: +ELLIPSIS
     ... # doctest: +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-                  n_values=None, sparse=True)
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='ignore', missing_values=nan, n_values=None,
+                  sparse=True)
 
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
@@ -317,12 +321,15 @@ class OneHotEncoder(_BaseEncoder):
 
     def __init__(self, n_values=None, categorical_features=None,
                  categories=None, drop=None, sparse=True, dtype=np.float64,
+                 missing_values=np.nan, handle_missing='all-zero',
                  handle_unknown='error'):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.n_values = n_values
+        self.missing_values = missing_values
+        self.handle_missing = handle_missing
         self.categorical_features = categorical_features
         self.drop = drop
 
@@ -489,7 +496,8 @@ def fit(self, X, y=None):
                 copy=True)
             return self
         else:
-            self._fit(X, handle_unknown=self.handle_unknown)
+            self._fit(X, missing_values=self.missing_values,
+                      handle_unknown=self.handle_unknown)
             self.drop_idx_ = self._compute_drop_idx()
             return self
 
@@ -534,6 +542,10 @@ def _validate_keywords(self):
             msg = ("handle_unknown should be either 'error' or 'ignore', "
                    "got {0}.".format(self.handle_unknown))
             raise ValueError(msg)
+        if self.handle_missing not in ('all-zero', 'category', 'all-missing'):
+            msg = ("handle_missing should be either 'all-zero', 'category', "
+                   "or 'all-missing', got {0}.".format(self.handle_missing))
+            raise ValueError(msg)
         # If we have both dropped columns and ignored unknown
         # values, there will be ambiguous cells. This creates difficulties
         # in interpreting the model.
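
To make the three handle_missing strategies concrete before the implementation
hunks below, here is a minimal usage sketch. It is illustrative only and not
part of the patch; it assumes the patch is applied, and the expected outputs
mirror the parametrized expectations in test_one_hot_encoder_handle_missing
further down.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a'], ['b'], [np.nan], ['c']], dtype=object)

    # 'all-zero' (the default): a missing entry becomes a row of zeros.
    enc = OneHotEncoder(categories='auto', handle_missing='all-zero',
                        missing_values=np.nan, sparse=False)
    enc.fit_transform(X)
    # array([[1., 0., 0.],
    #        [0., 1., 0.],
    #        [0., 0., 0.],
    #        [0., 0., 1.]])

    # 'category': one extra column per feature flags missing entries.
    enc = OneHotEncoder(categories='auto', handle_missing='category',
                        missing_values=np.nan, sparse=False)
    enc.fit_transform(X)
    # array([[1., 0., 0., 0.],
    #        [0., 1., 0., 0.],
    #        [0., 0., 0., 1.],
    #        [0., 0., 1., 0.]])

    # 'all-missing': a missing entry becomes a row of NaNs.
    enc = OneHotEncoder(categories='auto', handle_missing='all-missing',
                        missing_values=np.nan, sparse=False)
    enc.fit_transform(X)
    # array([[ 1.,  0.,  0.],
    #        [ 0.,  1.,  0.],
    #        [nan, nan, nan],
    #        [ 0.,  0.,  1.]])
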
@@ -673,12 +685,38 @@ def _legacy_transform(self, X):
 
         return out if self.sparse else out.toarray()
 
+    def _make_onehot_sparse_matrix(self, labels, mask, cat_ns):
+        # Build the one-hot CSR matrix directly from the integer codes:
+        # each valid cell contributes a single 1 in its feature's block
+        flat_mask = mask.ravel()
+        n_values = np.array([0] + cat_ns)
+        feature_indices = np.cumsum(n_values)
+
+        indices = (labels + feature_indices[:-1]).ravel()[flat_mask]
+        indptr = mask.sum(axis=1).cumsum()
+        indptr = np.insert(indptr, 0, 0)
+        data = np.ones(len(indices))
+
+        out = sparse.csr_matrix((data, indices, indptr),
+                                shape=(len(labels), feature_indices[-1]),
+                                dtype=self.dtype)
+        return out
+
+    def _make_nan_sparse_matrix(self, Xi_missing, n_categories):
+        # Rows flagged as missing get a full block of NaNs for this feature
+        n_missing = np.sum(Xi_missing)
+        indptr = np.cumsum(Xi_missing) * n_categories
+        indptr = np.insert(indptr, 0, 0)
+        indices = np.full((n_missing, n_categories), np.arange(n_categories))
+        indices = indices.ravel()
+        data = np.full(len(indices), np.nan)
+        na_matrix = sparse.csr_matrix((data, indices, indptr),
+                                      (len(Xi_missing), n_categories))
+        return na_matrix
+
     def _transform_new(self, X):
         """New implementation assuming categorical input"""
         # validation of X happens in _check_X called by _transform
-        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-
-        n_samples, n_features = X_int.shape
+        X_int, X_missing, X_unknown = (
+            self._transform(X, missing_values=self.missing_values,
+                            handle_unknown=self.handle_unknown))
 
         if self.drop is not None:
             to_drop = self.drop_idx_.reshape(1, -1)
@@ -686,24 +724,30 @@ def _transform_new(self, X):
 
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
-            keep_cells = X_int != to_drop
-            X_mask &= keep_cells
+            drop_cells = X_int == to_drop
+            X_unknown |= drop_cells
             X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - 1 for cats in self.categories_]
+            cats_ns = [len(cats) - 1 for cats in self.categories_]
         else:
-            n_values = [len(cats) for cats in self.categories_]
-
-        mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
-        feature_indices = np.cumsum(n_values)
-        indices = (X_int + feature_indices[:-1]).ravel()[mask]
-        indptr = X_mask.sum(axis=1).cumsum()
-        indptr = np.insert(indptr, 0, 0)
-        data = np.ones(n_samples * n_features)[mask]
+            cats_ns = [len(cats) for cats in self.categories_]
+
+        if self.handle_missing == 'category':
+            # missing entries are encoded as the extra appended category
+            for i, c in enumerate(cats_ns):
+                Xi_missing = X_missing[:, i]
+                Xi_int = X_int[:, i]
+                Xi_int[Xi_missing] = c
+            cats_ns = [c + 1 for c in cats_ns]
+            X_valid = ~X_unknown
+            out = self._make_onehot_sparse_matrix(X_int, X_valid, cats_ns)
+        else:
+            X_valid = ~(X_missing | X_unknown)
+            out = self._make_onehot_sparse_matrix(X_int, X_valid, cats_ns)
+            if self.handle_missing == 'all-missing':
+                na_mat = [self._make_nan_sparse_matrix(X_missing[:, i], c)
+                          for i, c in enumerate(cats_ns)]
+                na_mat = sparse.hstack(na_mat)
+                out += na_mat
 
-        out = sparse.csr_matrix((data, indices, indptr),
-                                shape=(n_samples, feature_indices[-1]),
-                                dtype=self.dtype)
         if not self.sparse:
             return out.toarray()
         else:
@@ -751,16 +795,22 @@ def inverse_transform(self, X):
         #   raise ValueError("only supported for categorical features")
 
         check_is_fitted(self, 'categories_')
-        X = check_array(X, accept_sparse='csr')
+        if self.handle_missing == 'all-missing':
+            force_finite = 'allow-nan'
+        else:
+            force_finite = True
+        X = check_array(X, accept_sparse='csr', force_all_finite=force_finite)
 
         n_samples, _ = X.shape
         n_features = len(self.categories_)
-        if self.drop is None:
-            n_transformed_features = sum(len(cats)
-                                         for cats in self.categories_)
-        else:
-            n_transformed_features = sum(len(cats) - 1
-                                         for cats in self.categories_)
+
+        n_transformed_features = sum(len(cats) for cats in self.categories_)
+
+        if self.handle_missing == 'category':
+            n_transformed_features += len(self.categories_)
+
+        if self.drop is not None:
+            n_transformed_features -= len(self.categories_)
 
         # validate shape of passed X
         msg = ("Shape of the passed X data is not correct. Expected {0} "
@@ -769,17 +819,24 @@ def inverse_transform(self, X):
             raise ValueError(msg.format(n_transformed_features, X.shape[1]))
 
         # create resulting array of appropriate dtype
-        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
+        dt = np.find_common_type([cat.dtype for cat in self.categories_],
+                                 [type(self.missing_values)])
         X_tr = np.empty((n_samples, n_features), dtype=dt)
 
         j = 0
         found_unknown = {}
 
         for i in range(n_features):
-            if self.drop is None:
-                cats = self.categories_[i]
-            else:
-                cats = np.delete(self.categories_[i], self.drop_idx_[i])
+            cats = self.categories_[i]
+            if self.drop is not None:
+                cats = np.delete(cats, self.drop_idx_[i])
+            if self.handle_missing == 'category':
+                cdt = np.find_common_type([cats.dtype],
+                                          [type(self.missing_values)])
+                if cdt != cats.dtype:
+                    cats = cats.astype(cdt)
+                cats = np.append(cats, self.missing_values)
+
             n_categories = len(cats)
 
             # Only happens if there was a column with a unique
@@ -805,6 +862,19 @@ def inverse_transform(self, X):
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
                     X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
+            elif self.handle_missing == 'all-zero':
+                # an all-zero row marks a missing entry under 'all-zero'
+                missing = np.asarray(sub.sum(axis=1) == 0).flatten()
+                X_tr[:, i][missing] = self.missing_values
+
+            if self.handle_missing == 'all-missing':
+                if sparse.isspmatrix_csr(sub):
+                    missing = sub.copy()
+                    missing.data = np.isnan(missing.data)
+                else:
+                    missing = np.isnan(sub)
+                missing = np.asarray(missing.sum(axis=1) == n_categories)
+                missing = missing.flatten()
+                X_tr[:, i][missing] = self.missing_values
 
             j += n_categories
 
@@ -894,7 +964,10 @@ class OrdinalEncoder(_BaseEncoder):
     >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
     >>> enc.fit(X)
     ... # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    ... # doctest: +NORMALIZE_WHITESPACE
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   missing_values=nan)
+
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
     >>> enc.transform([['Female', 3], ['Male', 1]])
@@ -913,9 +986,11 @@ class OrdinalEncoder(_BaseEncoder):
         between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64,
+                 missing_values=np.nan):
         self.categories = categories
         self.dtype = dtype
+        self.missing_values = missing_values
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
@@ -933,7 +1008,7 @@ def fit(self, X, y=None):
         # base classes uses _categories to deal with deprecations in
         # OneHoteEncoder: can be removed once deprecations are removed
         self._categories = self.categories
-        self._fit(X)
+        self._fit(X, missing_values=self.missing_values)
 
         return self
 
@@ -951,7 +1026,7 @@ def transform(self, X):
             Transformed input.
""" - X_int, _ = self._transform(X) + X_int, _, _ = self._transform(X, missing_values=self.missing_values) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f7cffa1e663b5..1ce6c2ae6ba8c 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -17,7 +17,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.sparsefuncs import min_max_axis -from ..utils import column_or_1d +from ..utils import column_or_1d, is_scalar_nan from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples @@ -33,131 +33,217 @@ ] -def _encode_numpy(values, uniques=None, encode=False): - # only used in _encode below, see docstring there for details - if uniques is None: - if encode: - uniques, encoded = np.unique(values, return_inverse=True) - return uniques, encoded - else: - # unique sorts - return np.unique(values) - if encode: - diff = _encode_check_unknown(values, uniques) - if diff: - raise ValueError("y contains previously unseen labels: %s" - % str(diff)) - encoded = np.searchsorted(uniques, values) - return uniques, encoded +def _nanunique(ar, return_inverse=False): + if return_inverse: + uniques, reverse = np.unique(ar, return_inverse=return_inverse) else: - return uniques + uniques = np.unique(ar) + # np.nan is always sorted last + if len(uniques) and is_scalar_nan(uniques[-1]): + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + if return_inverse: + reverse[reverse > nan_idx] = nan_idx -def _encode_python(values, uniques=None, encode=False): - # only used in _encode below, see docstring there for details - if uniques is None: - uniques = sorted(set(values)) - uniques = np.array(uniques, dtype=values.dtype) - if encode: - table = {val: i for i, val in enumerate(uniques)} - try: - encoded = np.array([table[v] for v in values]) - except KeyError as e: - raise ValueError("y contains previously unseen labels: %s" - % str(e)) - return uniques, encoded + if return_inverse: + return uniques, reverse else: return uniques -def _encode(values, uniques=None, encode=False): - """Helper function to factorize (find uniques) and encode values. +def _nanin1d(ar1, ar2, assume_unique=False): + ar1 = np.ravel(ar1) + ar2 = np.ravel(ar2) - Uses pure python method for object dtype, and numpy method for - all other dtypes. - The numpy method has the limitation that the `uniques` need to - be sorted. Importantly, this is not checked but assumed to already be - the case. The calling method needs to ensure this for all non-object - values. + if not assume_unique: + ar1, rev = _nanunique(ar1, return_inverse=True) + ar2 = _nanunique(ar2) - Parameters - ---------- - values : array - Values to factorize or encode. - uniques : array, optional - If passed, uniques are not determined from passed values (this - can be because the user specified categories, or because they - already have been determined in fit). - encode : bool, default False - If True, also encode the values into integer codes based on `uniques`. 
+    # The FutureWarning is usually triggered by a nan comparison so it might be
+    # better to just suppress the warning here
+    with warnings.catch_warnings():
+        warnings.simplefilter(action='ignore', category=FutureWarning)
+        in1d = np.in1d(ar1, ar2, True)
+    try:
+        in1d[-1] = (in1d[-1] or
+                    (is_scalar_nan(ar1[-1]) and is_scalar_nan(ar2[-1])))
+    except IndexError:
+        pass
 
-    Returns
-    -------
-    uniques
-        If ``encode=False``. The unique values are sorted if the `uniques`
-        parameter was None (and thus inferred from the data).
-    (uniques, encoded)
-        If ``encode=True``.
+    if assume_unique:
+        return in1d
+    else:
+        return in1d[rev]
 
-    """
-    if values.dtype == object:
-        try:
-            res = _encode_python(values, uniques, encode)
-        except TypeError:
-            raise TypeError("argument must be a string or number")
-        return res
+
+def _nansetdiff1d(ar1, ar2, assume_unique=False):
+    if assume_unique:
+        ar1 = np.ravel(ar1)
     else:
-        return _encode_numpy(values, uniques, encode)
+        ar1 = _nanunique(ar1)
+        ar2 = _nanunique(ar2)
+    return ar1[~_nanin1d(ar1, ar2, True)]
 
 
-def _encode_check_unknown(values, uniques, return_mask=False):
-    """
-    Helper function to check for unknowns in values to be encoded.
+def _nanencode_numpy(values, uniques=None, encode=False,
+                     missing_values=np.nan, encode_unknown=False):
+    check_values = True
+    if uniques is None:
+        uniques = _nanunique(values)
+        uniques = _nansetdiff1d(uniques, [missing_values], True)
+        check_values = False
 
-    Uses pure python method for object dtype, and numpy method for
-    all other dtypes.
+    if encode:
+        if check_values:
+            unique_values = _nanunique(values)
+            unseen = _nansetdiff1d(unique_values, [missing_values], True)
+            unseen = _nansetdiff1d(unseen, uniques, True)
 
-    Parameters
-    ----------
-    values : array
-        Values to check for unknowns.
-    uniques : array
-        Allowed uniques values.
-    return_mask : bool, default False
-        If True, return a mask of the same shape as `values` indicating
-        the valid values.
+            if not encode_unknown:
+                if len(unseen):
+                    raise ValueError("y contains previously unseen labels: %s"
+                                     % str(unseen), unseen)
 
-    Returns
-    -------
-    diff : list
-        The unique values present in `values` and not in `uniques` (the
-        unknown values).
-    valid_mask : boolean array
-        Additionally returned if ``return_mask=True``.
+        encoded = np.searchsorted(uniques, values)
+        from ..impute import _get_mask
+        missing_mask = _get_mask(values, missing_values)
 
-    """
-    if values.dtype == object:
-        uniques_set = set(uniques)
-        diff = list(set(values) - uniques_set)
-        if return_mask:
-            if diff:
-                valid_mask = np.array([val in uniques_set for val in values])
+        if encode_unknown:
+            if check_values:
+                unknown_mask = _nanin1d(values, unseen)
             else:
-                valid_mask = np.ones(len(values), dtype=bool)
-            return diff, valid_mask
+                unknown_mask = np.zeros_like(values, dtype=np.bool)
+            return uniques, encoded, missing_mask, unknown_mask
         else:
-            return diff
+            return uniques, encoded, missing_mask
     else:
-        unique_values = np.unique(values)
-        diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
-        if return_mask:
-            if diff:
-                valid_mask = np.in1d(values, uniques)
-            else:
-                valid_mask = np.ones(len(values), dtype=bool)
-            return diff, valid_mask
+        return uniques
+
+
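The helpers above exist because the stock numpy set routines mishandle NaN:
equality with NaN is always False, so np.unique (on the numpy versions this
patch targets) can return several NaNs and np.in1d never matches one. A quick
illustration in plain numpy, independent of this patch:

    import numpy as np

    a = np.array([1.0, np.nan, np.nan])

    # nan != nan, so np.unique keeps every nan it sees,
    # hence the trimming after searchsorted in _nanunique
    np.unique(a)          # array([ 1., nan, nan])

    # np.in1d relies on ==, so a nan is never found,
    # hence the explicit last-element check in _nanin1d
    np.in1d([np.nan], a)  # array([False])

    # sorting puts nan last, which both helpers exploit
    np.sort(a)            # array([ 1., nan, nan])
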
+# Since it is cheaper to remove an item from a set than from an array, the
+# exclusion of the missing value is folded into _nanunique_object. Also, since
+# None and np.nan are not sortable, they are handled separately after the sort.
+def _nanunique_object(ar, exclude_value):
+    items = set(ar)
+
+    # nan might not be discarded since nan comes in different forms
+    items.discard(exclude_value)
+
+    # Handle None afterwards if it is in the items. Set a flag for now.
+    try:
+        items.remove(None)
+        has_none = True
+    except KeyError:
+        has_none = False
+
+    # Handle nan afterwards if it is in the items. Set a flag for now. Since
+    # nan can come in different forms, we check everything.
+    items_not_na = [i for i in items if not is_scalar_nan(i)]
+    has_na = len(items) > len(items_not_na)
+
+    # Sort without None and nan
+    uniques = sorted(items_not_na)
+
+    # Bring back None if needed
+    if has_none:
+        uniques.append(None)
+
+    # Bring back nan if needed. Since nan comes in different forms, it might
+    # still exist despite being discarded from the set
+    if has_na and not is_scalar_nan(exclude_value):
+        uniques.append(np.nan)
+
+    return uniques
+
+
+# Since nan comes in multiple forms, hash is not enough to identify it
+def _dict_to_mapper(d, **kwargs):
+    try:
+        nan_value = kwargs['nan_value']
+
+        def mapper(x):
+            try:
+                return d[x]
+            except KeyError:
+                if is_scalar_nan(x):
+                    return nan_value
+                else:
+                    raise
+        return mapper
+    except KeyError:
+        return lambda x: d[x]
+
+
+def _make_mapper(uniques, missing_values, missing_index):
+    if is_scalar_nan(uniques[-1]):
+        # nan is encoded as len(uniques) - 1
+        table = {val: i for i, val in enumerate(uniques[:-1])}
+        table[missing_values] = missing_index
+        table_mapper = _dict_to_mapper(table, nan_value=len(uniques) - 1)
+    else:
+        table = {val: i for i, val in enumerate(uniques)}
+        if is_scalar_nan(missing_values):
+            # nan is encoded as the missing index
+            table_mapper = _dict_to_mapper(table, nan_value=missing_index)
+        else:
+            # No need for a nan value
+            table[missing_values] = missing_index
+            table_mapper = _dict_to_mapper(table)
+    return table_mapper
+
+
+def _nanencode_python(values, uniques=None, encode=False,
+                      missing_values=np.nan, encode_unknown=False):
+    if uniques is None:
+        uniques = _nanunique_object(values, missing_values)
+        uniques = np.array(uniques, dtype=values.dtype)
+
+    if encode:
+        # Use index -1 so that the number encoding will not cause failures
+        # if used by the consumer for indexing. It will still fail when used
+        # for indexing an empty array, but so would any other index.
+        missing_index = -1
+        mapper = _make_mapper(uniques, missing_values, missing_index)
+
+        if encode_unknown:
+            unknown_index = -2
+
+            def safe_mapper(x):
+                try:
+                    return mapper(x)
+                except KeyError:
+                    return unknown_index
+
+            np_mapper = safe_mapper
+        else:
+            np_mapper = mapper
+
+        try:
+            encoded = np.array([np_mapper(v) for v in values], dtype=np.int)
+            missing_mask = encoded == missing_index
+        except KeyError as e:
+            unseen = e.args[0]
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(unseen), unseen)
+
+        if encode_unknown:
+            unknown_mask = encoded == unknown_index
+            return uniques, encoded, missing_mask, unknown_mask
+        else:
+            return uniques, encoded, missing_mask
+    else:
+        return uniques
+
+
+def _nanencode(values, uniques=None, encode=False,
+               missing_values=np.nan, encode_unknown=False):
+    if values.dtype == object:
+        return _nanencode_python(values, uniques, encode,
+                                 missing_values, encode_unknown)
+    else:
+        return _nanencode_numpy(values, uniques, encode,
+                                missing_values, encode_unknown)
 
 
 class LabelEncoder(BaseEstimator, TransformerMixin):
@@ -217,7 +303,7 @@ def fit(self, y):
         self : returns an instance of self.
""" y = column_or_1d(y, warn=True) - self.classes_ = _encode(y) + self.classes_ = _nanencode(y) return self def fit_transform(self, y): @@ -233,7 +319,7 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) - self.classes_, y = _encode(y, encode=True) + self.classes_, y, _ = _nanencode(y, encode=True) return y def transform(self, y): @@ -254,7 +340,7 @@ def transform(self, y): if _num_samples(y) == 0: return np.array([]) - _, y = _encode(y, uniques=self.classes_, encode=True) + _, y, _ = _nanencode(y, uniques=self.classes_, encode=True) return y def inverse_transform(self, y): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 850efe22b5d11..caf96226fba20 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -590,24 +590,55 @@ def test_one_hot_encoder_feature_names_unicode(): assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names) +@pytest.mark.parametrize('values, missing_values', [ + (np.array([0.0, 1.0, np.nan, 2.0]), np.nan), + (np.array([0.0, 1.0, -1.0, 2.0]), -1.0), + (np.array([0, 1, -1, 2]), -1), + (np.array(list('abdc'), dtype='str'), 'd'), + (np.array(['a', 'b', None, 'c'], dtype=object), None), + (np.array(['a', 'b', np.nan, 'c'], dtype=object), np.nan), + (np.array(['a', 'b', 'd', 'c'], dtype=object), 'd'), +]) +@pytest.mark.parametrize('handle_missing, expected', [ + ('all-zero', np.array([[1, 0, 0], [0, 1, 0], [0, 0, 0], [0, 0, 1]])), + ('category', np.array([[1, 0, 0, 0], [0, 1, 0, 0], + [0, 0, 0, 1], [0, 0, 1, 0]])), + ('all-missing', np.array([[1, 0, 0], [0, 1, 0], [np.nan] * 3, [0, 0, 1]])) +]) +@pytest.mark.parametrize('sp', [True, False]) +def test_one_hot_encoder_handle_missing(values, missing_values, + handle_missing, expected, sp): + values = values.reshape(-1, 1) + enc = OneHotEncoder(categories='auto', + handle_missing=handle_missing, + missing_values=missing_values, + sparse=sp) + result = enc.fit_transform(values) + cmp_result = result.toarray() if sp else result + assert_array_equal(expected, cmp_result) + nan_mask = np.array([0, 0, 1, 0], dtype=np.bool) + + # There is a bug with assert_array_equal when comparing object arrays with + # nans. Thus, the nans are compared separately. 
+    reverse = enc.inverse_transform(result)
+    assert_array_equal(values[~nan_mask], reverse[~nan_mask])
+    np.testing.assert_equal(values[nan_mask], reverse[nan_mask])
+
+
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])
 @pytest.mark.parametrize("as_data_frame", [False, True],
                          ids=['array', 'dataframe'])
 @pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
-def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
+def test_one_hot_encoder_accept_missing(X, as_data_frame, handle_unknown):
     if as_data_frame:
         pd = pytest.importorskip('pandas')
         X = pd.DataFrame(X)
 
     ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit(X)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit_transform(X)
+    ohe.fit(X)
+    ohe.fit_transform(X)
 
     if as_data_frame:
         X_partial = X.iloc[:1, :]
@@ -615,9 +646,7 @@ def test_one_hot_encoder_accept_missing(X, as_data_frame, handle_unknown):
         X_partial = X[:1, :]
 
     ohe.fit(X_partial)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.transform(X)
+    ohe.transform(X)
 
 
 @pytest.mark.parametrize("X", [
@@ -678,19 +707,12 @@ def test_ordinal_encoder_inverse():
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])
-def test_ordinal_encoder_raise_missing(X):
+def test_ordinal_encoder_accept_missing(X):
     ohe = OrdinalEncoder()
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit(X)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit_transform(X)
-
+    ohe.fit(X)
+    ohe.fit_transform(X)
     ohe.fit(X[:1, :])
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.transform(X)
+    ohe.transform(X)
 
 
 def test_encoder_dtypes():
@@ -732,7 +754,7 @@ def test_encoder_dtypes_pandas():
     assert_array_equal(enc.transform(X).toarray(), exp)
 
     X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
-    X_type = [int, object, float]
+    X_type = [X[col].dtype for col in X.columns]
     enc.fit(X)
     assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
     assert_array_equal(enc.transform(X).toarray(), exp)
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 8a7db601686a8..02f6a0edc34a9 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -25,7 +25,7 @@
 from sklearn.preprocessing.label import _inverse_binarize_thresholding
 from sklearn.preprocessing.label import _inverse_binarize_multiclass
-from sklearn.preprocessing.label import _encode
+from sklearn.preprocessing.label import _nanencode
 
 from sklearn import datasets
 
@@ -589,19 +589,160 @@ def test_inverse_binarize_multiclass():
 
 
 @pytest.mark.parametrize(
-        "values, expected",
+        "values, expected, extra_value",
         [(np.array([2, 1, 3, 1, 3], dtype='int64'),
-          np.array([1, 2, 3], dtype='int64')),
+          np.array([1, 2, 3], dtype='int64'),
+          4),
          (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
-          np.array(['a', 'b', 'c'], dtype=object)),
+          np.array(['a', 'b', 'c'], dtype=object),
+          'd'),
          (np.array(['b', 'a', 'c', 'a', 'c']),
-          np.array(['a', 'b', 'c']))],
+          np.array(['a', 'b', 'c']),
+          'd')],
         ids=['int64', 'object', 'str'])
-def test_encode_util(values, expected):
-    uniques = _encode(values)
+def test_nanencode_util_as_encode(values, expected, extra_value):
+    uniques = _nanencode(values)
     assert_array_equal(uniques, expected)
-    uniques, encoded = _encode(values, encode=True)
+    uniques, encoded, na_mask = _nanencode(values, encode=True)
     assert_array_equal(uniques, expected)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
-    _, encoded = _encode(values, uniques, encode=True)
+    assert_array_equal(na_mask, np.zeros_like(values, dtype=np.bool))
+    uniques_, encoded, na_mask = _nanencode(values, uniques, encode=True)
+    assert_array_equal(uniques_, uniques)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+    assert_array_equal(na_mask, np.zeros_like(values, dtype=np.bool))
+
+    unclean_value = np.append(values, extra_value)
+    assert_raises(ValueError, _nanencode, unclean_value, uniques, encode=True)
+
+
+# There is a bug with assert_array_equal when comparing object arrays with
+# nans. Since nans are only seen at the end, the last items are asserted
+# separately. See https://github.com/numpy/numpy/issues/9023
+def nanassert_array_equal(a, b):
+    # The sklearn assert_equal does not support nans
+    np.testing.assert_equal(a[-1], b[-1])
+    assert_array_equal(a[:-1], b[:-1])
+
+
+@pytest.mark.parametrize(
+    "values, expected, missing_values",
+    [(np.array([2, 1, 3, 1, 3, np.nan, 2, np.float('nan')],
+               dtype='float64'),
+      np.array([1, 2, 3], dtype='float64'), np.nan),
+     (np.array([2, 1, np.nan, 1, np.float('nan'), 3, 2, 3],
+               dtype='float64'),
+      np.array([1, 2, np.nan], dtype='float64'), 3),
+     (np.array([2, 1, 3, 1, 3, 4, 2, 4], dtype='int64'),
+      np.array([1, 2, 3], dtype='int64'), 4),
+     (np.array(['b', 'a', np.nan, 'a', np.float('nan'), None, 'b', None],
+               dtype=object),
+      np.array(['a', 'b', np.nan], dtype=object), None),
+     (np.array(['b', 'a', None, 'a', None, np.float('nan'), 'b', np.nan],
+               dtype=object),
+      np.array(['a', 'b', None], dtype=object), np.nan),
+     (np.array(['b', 'a', None, 'a', None, 'c', 'b', 'c'], dtype=object),
+      np.array(['a', 'b', None], dtype=object), 'c'),
+     (np.array(['b', 'a', 'c', 'a', 'c', 'd', 'b', 'd']),
+      np.array(['a', 'b', 'c']), 'd')],
+    ids=['float64_nan', 'float64_value', 'int64_value',
+         'object_none', 'object_nan', 'object_value', 'str_value'])
+def test_nanencode_util_missing(values, expected, missing_values):
+    encoding_answer = np.array([1, 0, 2, 0, 2, 1])
+    na_mask_answer = np.array([0, 0, 0, 0, 0, 1, 0, 1], dtype=np.bool)
+
+    uniques = _nanencode(values, missing_values=missing_values)
+    nanassert_array_equal(uniques, expected)
+
+    uniques_ = _nanencode(values, uniques, False, missing_values)
+    nanassert_array_equal(uniques_, uniques)
+
+    res = _nanencode(values, None, True, missing_values)
+    uniques, encoded, na_mask = res
+    nanassert_array_equal(uniques, expected)
+    assert_array_equal(na_mask, na_mask_answer)
+    assert_array_equal(encoded[~na_mask], encoding_answer)
+
+    res = _nanencode(values, uniques, True, missing_values)
+    uniques_, encoded, na_mask = res
+    nanassert_array_equal(uniques_, uniques)
+    assert_array_equal(na_mask, na_mask_answer)
+    assert_array_equal(encoded[~na_mask], encoding_answer)
+
+
+@pytest.mark.parametrize(
+    "fit_values, tr_values, expected, missing_values",
+    [(np.array([2, 1, 3, 1, 3, np.nan, 2, np.float('nan')],
+               dtype='float64'),
+      np.array([4, 2, 1, 5, 3, 1, 3, np.nan, 2, np.float('nan')],
+               dtype='float64'),
+      np.array([1, 2, 3], dtype='float64'), np.nan),
+     (np.array([2, 1, np.nan, 1, np.float('nan'), 3, 2, 3],
+               dtype='float64'),
+      np.array([4, 2, 1, 5, np.nan, 1, np.float('nan'), 3, 2, 3],
+               dtype='float64'),
+      np.array([1, 2, np.nan], dtype='float64'), 3),
+     (np.array([2, 1, 3, 1, 3, 4, 2, 4],
+               dtype='int64'),
+      np.array([5, 2, 1, 6, 3, 1, 3, 4, 2, 4], dtype='int64'),
+      np.array([1, 2, 3], dtype='int64'), 4),
+     (np.array(['b', 'a', np.nan, 'a', np.float('nan'), None, 'b', None],
+               dtype=object),
+      np.array(['d', 'b', 'a', 'e', np.nan, 'a', np.float('nan'),
+                None, 'b', None], dtype=object),
+      np.array(['a', 'b', np.nan], dtype=object), None),
+     (np.array(['b', 'a', None, 'a', None, np.float('nan'), 'b', np.nan],
+               dtype=object),
+      np.array(['c', 'b', 'a', 'd', None, 'a', None, np.float('nan'),
+                'b', np.nan], dtype=object),
+      np.array(['a', 'b', None], dtype=object), np.nan),
+     (np.array(['b', 'a', None, 'a', None, 'c', 'b', 'c'], dtype=object),
+      np.array(['d', 'b', 'a', 'e', None, 'a', None, 'c', 'b', 'c'],
+               dtype=object),
+      np.array(['a', 'b', None], dtype=object), 'c'),
+     (np.array(['b', 'a', 'c', 'a', 'c', 'd', 'b', 'd']),
+      np.array(['e', 'b', 'a', 'f', 'c', 'a', 'c', 'd', 'b', 'd']),
+      np.array(['a', 'b', 'c']), 'd')],
+    ids=['float64_nan', 'float64_value', 'int64_value',
+         'object_none', 'object_nan', 'object_value', 'str_value'])
+def test_nanencode_util_missing_unknown(fit_values, tr_values,
+                                        expected, missing_values):
+    encoding_answer = np.array([1, 0, 2, 0, 2, 1])
+    fit_na_mask_answer = np.array([0, 0, 0, 0, 0, 1, 0, 1], dtype=np.bool)
+    fit_unk_mask_answer = np.zeros_like(fit_na_mask_answer)
+    tr_na_mask_answer = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
+                                 dtype=np.bool)
+    tr_unk_mask_answer = np.array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
+                                  dtype=np.bool)
+
+    uniques = _nanencode(fit_values, None, False, missing_values, True)
+    nanassert_array_equal(uniques, expected)
+
+    uniques_ = _nanencode(fit_values, uniques, False, missing_values, True)
+    nanassert_array_equal(uniques_, uniques)
+
+    res = _nanencode(fit_values, None, True, missing_values, True)
+    uniques, encoded, na_mask, unk_mask = res
+    nanassert_array_equal(uniques, expected)
+    assert_array_equal(na_mask, fit_na_mask_answer)
+    assert_array_equal(unk_mask, fit_unk_mask_answer)
+    assert_array_equal(encoded[~(na_mask | unk_mask)], encoding_answer)
+
+    res = _nanencode(fit_values, uniques, True, missing_values, True)
+    uniques_, encoded, na_mask, unk_mask = res
+    nanassert_array_equal(uniques_, uniques)
+    assert_array_equal(na_mask, fit_na_mask_answer)
+    assert_array_equal(unk_mask, fit_unk_mask_answer)
+    assert_array_equal(encoded[~(na_mask | unk_mask)], encoding_answer)
+
+    uniques_ = _nanencode(tr_values, uniques, False, missing_values, True)
+    nanassert_array_equal(uniques_, uniques)
+
+    assert_raises(ValueError, _nanencode, tr_values, uniques, True,
+                  missing_values, False)
+
+    res = _nanencode(tr_values, uniques, True, missing_values, True)
+    uniques_, encoded, na_mask, unk_mask = res
+    nanassert_array_equal(uniques_, uniques)
+    assert_array_equal(na_mask, tr_na_mask_answer)
+    assert_array_equal(unk_mask, tr_unk_mask_answer)
+    assert_array_equal(encoded[~(na_mask | unk_mask)], encoding_answer)
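
Taken together, these tests pin down the contract of the _nanencode helper. A
condensed sketch of that contract for reference, assuming the patched private
helper from sklearn/preprocessing/label.py is importable (it is private API,
so this is illustrative only):

    import numpy as np
    from sklearn.preprocessing.label import _nanencode

    values = np.array(['b', 'a', np.nan, 'a'], dtype=object)

    # Fit time: infer sorted categories, excluding the missing marker
    # (np.nan by default)
    uniques = _nanencode(values)
    # array(['a', 'b'], dtype=object)

    # Transform time: integer codes plus a mask of the missing positions
    uniques, encoded, missing_mask = _nanencode(values, uniques, encode=True)
    encoded[~missing_mask]   # array([1, 0, 0])
    missing_mask             # array([False, False,  True, False])

    # With encode_unknown=True, unseen labels are masked instead of raising
    new = np.array(['b', 'z'], dtype=object)
    _, enc2, na2, unk2 = _nanencode(new, uniques, encode=True,
                                    encode_unknown=True)
    unk2                     # array([False,  True])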