From 3f9c591ec25929719acac7a2fd40ec7334dc2ff2 Mon Sep 17 00:00:00 2001
From: twsthomas
Date: Thu, 12 Sep 2019 09:58:41 +0200
Subject: [PATCH 01/16] label encode with nan and mixed types

---
 sklearn/preprocessing/_encoders.py           |  10 +-
 sklearn/preprocessing/label.py               | 101 ++++++++++++++++---
 sklearn/preprocessing/tests/test_encoders.py |  62 +++++++-----
 sklearn/preprocessing/tests/test_label.py    |  13 +++
 4 files changed, 140 insertions(+), 46 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c33744204fc36..9104d1028a1b8 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -26,7 +26,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

     """

-    def _check_X(self, X):
+    def _check_X(self, X, force_all_finite=True):
         """
         Perform custom check_array:
         - convert list of strings to object dtype
@@ -40,10 +40,10 @@ def _check_X(self, X):
         """
         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
             # if not a dataframe, do normal check_array validation
-            X_temp = check_array(X, dtype=None)
+            X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
             if (not hasattr(X, 'dtype')
                     and np.issubdtype(X_temp.dtype, np.str_)):
-                X = check_array(X, dtype=np.object)
+                X = check_array(X, dtype=np.object, force_all_finite=force_all_finite)
             else:
                 X = X_temp
             needs_validation = False
@@ -71,7 +71,7 @@ def _get_feature(self, X, feature_idx):
         return X[:, feature_idx]

     def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        X_list, n_samples, n_features = self._check_X(X, force_all_finite='allow-nan')

         if self.categories != 'auto':
             if len(self.categories) != n_features:
@@ -99,7 +99,7 @@ def _fit(self, X, handle_unknown='error'):
             self.categories_.append(cats)

     def _transform(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        X_list, n_samples, n_features = self._check_X(X, force_all_finite='allow-nan')

         X_int = np.zeros((n_samples, n_features), dtype=np.int)
         X_mask = np.ones((n_samples, n_features), dtype=np.bool)
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f16c7588fe13c..3dccab59cc5bc 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -18,12 +18,14 @@

 from ..utils.sparsefuncs import min_max_axis
 from ..utils import column_or_1d
+from ..utils import is_scalar_nan
+from ..utils.fixes import _object_dtype_isnan
 from ..utils.validation import check_array
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _num_samples
 from ..utils.multiclass import unique_labels
 from ..utils.multiclass import type_of_target
-
+from ..impute._base import _get_mask

 __all__ = [
     'label_binarize',
@@ -33,8 +35,17 @@
 ]


+def get_encoding(uniques, values):
+    if np.all(np.diff(uniques) > 0):
+        return np.searchsorted(uniques, values)
+    else:
+        table = {val: i for i, val in enumerate(uniques)}
+        return np.array([table[v] for v in values])
+
+
 def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
     # only used in _encode below, see docstring there for details
+    # expects that `values` and `uniques` do not contain nan
     if uniques is None:
         if encode:
             uniques, encoded = np.unique(values, return_inverse=True)
@@ -48,16 +59,20 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True):
             if diff:
                 raise ValueError("y contains previously unseen labels: %s"
                                  % str(diff))
-        encoded = np.searchsorted(uniques, values)
+        encoded = get_encoding(uniques, values)
         return uniques, encoded
     else:
         return uniques


 def _encode_python(values, uniques=None, encode=False):
-    # only used in _encode below, see docstring there for details
+    # only used in _encode below, see docstring there for details.
     if uniques is None:
-        uniques = sorted(set(values))
+        try:
+            uniques = sorted(set(values))
+        except TypeError:
+            # cannot sort mixed types (str and float)
+            uniques = set(values)
         uniques = np.array(uniques, dtype=values.dtype)
     if encode:
         table = {val: i for i, val in enumerate(uniques)}
@@ -71,15 +86,51 @@ def _encode_python(values, uniques=None, encode=False):
         return uniques


+def _encode_python_with_nan(values, uniques=None, encode=False):
+    # only used in _encode below, see docstring there for details
+    if uniques is None:
+        missing_vals = _get_mask(values, np.nan)
+        assert np.any(missing_vals)
+        # set([nan, nan]) can keep both entries, since nan != nan
+        uniques = set(values[~missing_vals]) | {np.nan}
+        uniques = np.array(list(uniques), dtype=values.dtype)
+    if encode:
+        table = dict()
+        for i, val in enumerate(uniques):
+            if is_scalar_nan(val):
+                # table[nan] would always raise KeyError (nan != nan)
+                nan_index = i
+            else:
+                table[val] = i
+        try:
+            encoded = []
+            for val in values:
+                if is_scalar_nan(val):
+                    encoded.append(nan_index)
+                else:
+                    encoded.append(table[val])
+            encoded = np.array(encoded)
+        except KeyError as e:
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(e))
+        return uniques, encoded
+    else:
+        return uniques
+
+
+def _encode_numpy_with_nan(values, uniques=None, encode=False, check_unknown=True):
+    # `np.unique` does not work here
+    return _encode_python_with_nan(values, uniques, encode)
+
+
 def _encode(values, uniques=None, encode=False, check_unknown=True):
     """Helper function to factorize (find uniques) and encode values.

-    Uses pure python method for object dtype, and numpy method for
-    all other dtypes.
-    The numpy method has the limitation that the `uniques` need to
-    be sorted. Importantly, this is not checked but assumed to already be
-    the case. The calling method needs to ensure this for all non-object
-    values.
+    Uses pure python method for object dtype or if values contains nan,
+    and numpy method for all other dtypes.
+    If values contains nan or mixed types (e.g. str and float), sorting
+    becomes meaningless, but sorted uniques are still nice to have since
+    they speed up `get_encoding`.

     Parameters
     ----------
@@ -107,16 +158,30 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
         If ``encode=True``.
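    As a plain-Python illustration of the two corner cases handled
    above (a sketch, independent of the scikit-learn helpers): mixed
    types cannot be sorted, and NaN only hits a dict entry when it is
    the very same object, since NaN != NaN.

    >>> import numpy as np
    >>> sorted({4, 'm'})  # doctest: +SKIP
    Traceback (most recent call last):
    TypeError: '<' not supported between instances of 'str' and 'int'
    >>> table = {np.nan: 0}
    >>> table[np.nan]  # same object, found by the identity check
    0
    >>> float('nan') in table  # a different NaN object
    False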
""" + nan_in_uniques = False + # TODO use instead _assert_all_finite + if uniques is not None and np.any(_get_mask(uniques, np.nan)): + nan_in_uniques = True + if values.dtype == object: try: - res = _encode_python(values, uniques, encode) + if np.any(_object_dtype_isnan(values)) or \ + nan_in_uniques: + res = _encode_python_with_nan(values, uniques, encode) + else: + res = _encode_python(values, uniques, encode) except TypeError: raise TypeError("argument must be a string or number") return res else: - return _encode_numpy(values, uniques, encode, + if (values.dtype.kind == 'f' and np.isnan(values)) or \ + nan_in_uniques: + # couldn't use `_encode_numpy` if `values` contains nan + res = _encode_python_with_nan(values, uniques, encode) + else: + res = _encode_numpy(values, uniques, encode, check_unknown=check_unknown) - + return res def _encode_check_unknown(values, uniques, return_mask=False): """ @@ -147,6 +212,11 @@ def _encode_check_unknown(values, uniques, return_mask=False): if values.dtype == object: uniques_set = set(uniques) diff = list(set(values) - uniques_set) + # set([np.nan]) - set([np.nan]) returns set([np.nan]) + if diff and any(_object_dtype_isnan(diff)): + if any(_object_dtype_isnan(uniques_set)) and\ + any(_object_dtype_isnan(set(values))): + diff = diff[~_object_dtype_isnan(diff)] if return_mask: if diff: valid_mask = np.array([val in uniques_set for val in values]) @@ -158,6 +228,11 @@ def _encode_check_unknown(values, uniques, return_mask=False): else: unique_values = np.unique(values) diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) + # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] + if any(is_scalar_nan(diff)): + if any(is_scalar_nan(unique_values)) and\ + any(is_scalar_nan(uniques)): + diff = [x for x in diff if not is_scalar_nan(x)] if return_mask: if diff: valid_mask = np.in1d(values, uniques) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 8e1a61781544a..1a11e287520ee 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -220,6 +220,31 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) +@pytest.mark.parametrize("X", [ + [['def', 1, np.nan], ['abc', 2, np.nan]], + np.array([[10, 1, np.nan], [5, 2, np.nan]]), + np.array([['b', 'A', np.nan], ['a', 'B', np.nan]], dtype=object) + ], ids=['mixed', 'numeric', 'object']) +def test_one_hot_encoder_with_nan(X): + Xtr = check_categorical_onehot(np.array(X)[:, [0]]) + assert_allclose(Xtr, [[0, 1], [1, 0]]) + + Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) + assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) + + Xtr = OneHotEncoder(categories='auto').fit_transform(X) + assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) + +def test_ohe_handle_unknow_sparse_nan(): + # TODO + pass + + +def test_ohe_handle_unknow_nan(): + # TODO + pass + + @pytest.mark.parametrize('sparse_', [False, True]) @pytest.mark.parametrize('drop', [None, 'first']) def test_one_hot_encoder_inverse(sparse_, drop): @@ -444,18 +469,14 @@ def test_one_hot_encoder_feature_names_drop(drop, expected_names): @pytest.mark.parametrize("as_data_frame", [False, True], ids=['array', 'dataframe']) @pytest.mark.parametrize("handle_unknown", ['error', 'ignore']) -def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): +def test_one_hot_encoder_accept_nan(X, as_data_frame, handle_unknown): if as_data_frame: pd = 
pytest.importorskip('pandas') X = pd.DataFrame(X) ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) + ohe.fit(X) + ohe.fit_transform(X) if as_data_frame: X_partial = X.iloc[:1, :] @@ -463,9 +484,7 @@ def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): X_partial = X[:1, :] ohe.fit(X_partial) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) + ohe.transform(X) @pytest.mark.parametrize("X", [ @@ -523,24 +542,6 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) -@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, - np.array([['a', np.nan]], dtype=object).T], - ids=['numeric', 'object']) -def test_ordinal_encoder_raise_missing(X): - ohe = OrdinalEncoder() - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) - - ohe.fit(X[:1, :]) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) - - def test_ordinal_encoder_raise_categories_shape(): X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T @@ -601,6 +602,11 @@ def test_one_hot_encoder_warning(): X = [['Male', 1], ['Female', 3]] np.testing.assert_no_warnings(enc.fit_transform, X) +def test_one_hot_encoder_accept_nan(): + enc = OneHotEncoder() + X = [[np.nan, 1], ['Female', np.nan]] + enc.fit_transform(X) + def test_one_hot_encoder_drop_manual(): cats_to_drop = ['def', 12, 3, 56] diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index a095f4ec64cab..5ce40941fc106 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -206,6 +206,19 @@ def test_label_encoder_negative_ints(): [0, 1, 4, 4, 5, -1, -1]) assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encode_with_nan(): + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float))) == 1 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object))) == 1 + assert len(_encode(np.asarray([4, 'm', np.nan]))) == 3 + assert len(_encode(np.asarray([4, np.nan]))) == 2 + + assert len(_encode(np.asarray([np.nan, np.nan],dtype=float), encode=True)[1]) == 2 + assert len(_encode(np.asarray([np.nan, np.nan],dtype=object), encode=True)[1]) == 2 + assert len(_encode(np.asarray([4, 'm', np.nan, np.nan, np.nan]), encode=True)[1]) == 5 + assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), encode=True)[1]) == 4 + +def test_label_encode_with_mixed_type(): + assert len(_encode(np.asarray([4, 'm']))) == 2 @pytest.mark.parametrize("dtype", ['str', 'object']) def test_label_encoder_str_bad_shape(dtype): From 49448c1b23f9115822b5af34295719ad68e5e381 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 09:30:21 +0200 Subject: [PATCH 02/16] chg noly label.py --- sklearn/preprocessing/label.py | 96 +++++++------------- sklearn/preprocessing/tests/test_encoders.py | 5 - sklearn/preprocessing/tests/test_label.py | 22 +++-- 3 files changed, 44 insertions(+), 79 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 3dccab59cc5bc..b8c801052fceb 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -43,9 +43,14 @@ def get_encoding(uniques, values): return np.array([table[v] for 
v in values]) -def _encode_numpy(values, uniques=None, encode=False, check_unknown=True): +def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, + allow_nan=False): # only used in _encode below, see docstring there for details - # excpect that `values` and `uniques` do not contains nan + + if allow_nan: + # `np.unique` does not work here + return _encode_python(values, uniques, encode, + allow_nan) if uniques is None: if encode: uniques, encoded = np.unique(values, return_inverse=True) @@ -59,40 +64,21 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True): if diff: raise ValueError("y contains previously unseen labels: %s" % str(diff)) - encoded = get_encoding(uniques, values) + encoded = np.searchsorted(uniques, values) return uniques, encoded else: return uniques -def _encode_python(values, uniques=None, encode=False): - # only used in _encode below, see docstring there for details. - if uniques is None: - try: - uniques = sorted(set(values)) - except TypeError: - # Couldn't sort with mixed type (str and float) - uniques = (set(values)) - uniques = np.array(uniques, dtype=values.dtype) - if encode: - table = {val: i for i, val in enumerate(uniques)} - try: - encoded = np.array([table[v] for v in values]) - except KeyError as e: - raise ValueError("y contains previously unseen labels: %s" - % str(e)) - return uniques, encoded - else: - return uniques - - -def _encode_python_with_nan(values, uniques=None, encode=False): +def _encode_python(values, uniques=None, encode=False, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: - missing_vals = _get_mask(values, np.nan) - assert np.any(missing_vals) - # set([nan, nan]) = {nan, nan} - uniques = set(values[~missing_vals]) | {np.nan} + if allow_nan: + missing_mask = _get_mask(values, np.nan) + if np.any(missing_mask): + uniques = sorted(set(values[~missing_mask]) | {np.nan}) + else: + uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) if encode: table = dict() @@ -118,19 +104,12 @@ def _encode_python_with_nan(values, uniques=None, encode=False): return uniques -def _encode_numpy_with_nan(values, uniques=None, encode=False, check_unknown=True): - # `np.unique` does not work here - _encode_python_with_nan(values, uniques, encode) - - -def _encode(values, uniques=None, encode=False, check_unknown=True): +def _encode(values, uniques=None, encode=False, check_unknown=True, + allow_nan=False): """Helper function to factorize (find uniques) and encode values. Uses pure python method for object dtype or if values contains nan, and numpy method for all other dtypes. - If values contains nan or mixed type (e.g. str and float) - sorted become meaningless (but still nice to have since it - speed up the `get_encoding`) Parameters ---------- @@ -158,30 +137,16 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): If ``encode=True``. 
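    A note on NaN ordering, which the `allow_nan` handling above relies
    on (illustrative): Python's built-in `sorted` uses pairwise `<`
    comparisons, and every comparison against NaN is False, so NaN is
    left wherever it happens to sit, while `np.sort` always places NaN
    last.

    >>> import numpy as np
    >>> sorted([np.nan, 1.0, 4.0])
    [nan, 1.0, 4.0]
    >>> np.sort([np.nan, 1.0, 4.0])
    array([ 1.,  4., nan])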
""" - nan_in_uniques = False - # TODO use instead _assert_all_finite - if uniques is not None and np.any(_get_mask(uniques, np.nan)): - nan_in_uniques = True - if values.dtype == object: try: - if np.any(_object_dtype_isnan(values)) or \ - nan_in_uniques: - res = _encode_python_with_nan(values, uniques, encode) - else: - res = _encode_python(values, uniques, encode) + res = _encode_python(values, uniques, encode, allow_nan) except TypeError: raise TypeError("argument must be a string or number") return res else: - if (values.dtype.kind == 'f' and np.isnan(values)) or \ - nan_in_uniques: - # couldn't use `_encode_numpy` if `values` contains nan - res = _encode_python_with_nan(values, uniques, encode) - else: - res = _encode_numpy(values, uniques, encode, - check_unknown=check_unknown) - return res + return _encode_numpy(values, uniques, encode, + check_unknown, allow_nan) + def _encode_check_unknown(values, uniques, return_mask=False): """ @@ -212,11 +177,12 @@ def _encode_check_unknown(values, uniques, return_mask=False): if values.dtype == object: uniques_set = set(uniques) diff = list(set(values) - uniques_set) - # set([np.nan]) - set([np.nan]) returns set([np.nan]) - if diff and any(_object_dtype_isnan(diff)): - if any(_object_dtype_isnan(uniques_set)) and\ - any(_object_dtype_isnan(set(values))): - diff = diff[~_object_dtype_isnan(diff)] + # set([np.nan]) - set([np.nan]) returns set() + # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} + if diff and any(_get_mask(diff, np.nan)): + if any(_get_mask(uniques_set, np.nan)) and\ + any(_get_mask(set(values), np.nan)): + diff = diff[~_get_mask(diff, np.nan)] if return_mask: if diff: valid_mask = np.array([val in uniques_set for val in values]) @@ -227,11 +193,11 @@ def _encode_check_unknown(values, uniques, return_mask=False): return diff else: unique_values = np.unique(values) - diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) + diff = np.setdiff1d(unique_values, uniques, assume_unique=True) # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] - if any(is_scalar_nan(diff)): - if any(is_scalar_nan(unique_values)) and\ - any(is_scalar_nan(uniques)): + if any(_get_mask(diff, np.nan)): + if any(_get_mask(unique_values, np.nan)) and\ + any(_get_mask(uniques, np.nan)): diff = [x for x in diff if not is_scalar_nan(x)] if return_mask: if diff: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 1a11e287520ee..d9c5092affd2d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -602,11 +602,6 @@ def test_one_hot_encoder_warning(): X = [['Male', 1], ['Female', 3]] np.testing.assert_no_warnings(enc.fit_transform, X) -def test_one_hot_encoder_accept_nan(): - enc = OneHotEncoder() - X = [[np.nan, 1], ['Female', np.nan]] - enc.fit_transform(X) - def test_one_hot_encoder_drop_manual(): cats_to_drop = ['def', 12, 3, 56] diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 5ce40941fc106..de5c4a7191a91 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -206,20 +206,24 @@ def test_label_encoder_negative_ints(): [0, 1, 4, 4, 5, -1, -1]) assert_raises(ValueError, le.transform, [0, 6]) + def test_label_encode_with_nan(): - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float))) == 1 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object))) == 1 - assert len(_encode(np.asarray([4, 'm', 
np.nan]))) == 3 - assert len(_encode(np.asarray([4, np.nan]))) == 2 - - assert len(_encode(np.asarray([np.nan, np.nan],dtype=float), encode=True)[1]) == 2 - assert len(_encode(np.asarray([np.nan, np.nan],dtype=object), encode=True)[1]) == 2 - assert len(_encode(np.asarray([4, 'm', np.nan, np.nan, np.nan]), encode=True)[1]) == 5 - assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), encode=True)[1]) == 4 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), allow_nan=True)) == 1 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), allow_nan=True)) == 1 + assert len(_encode(np.asarray([4, np.nan]), allow_nan=True)) == 2 + + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), + encode=True, allow_nan=True)[1]) == 2 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), + encode=True, allow_nan=True)[1]) == 2 + assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), + encode=True, allow_nan=True)[1]) == 4 + def test_label_encode_with_mixed_type(): assert len(_encode(np.asarray([4, 'm']))) == 2 + @pytest.mark.parametrize("dtype", ['str', 'object']) def test_label_encoder_str_bad_shape(dtype): le = LabelEncoder() From 302f3aedf135a37f9bb04124d93bcb2cf60f4575 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 10:59:26 +0200 Subject: [PATCH 03/16] iter, restore _encoders --- sklearn/preprocessing/_encoders.py | 483 ++++++++++++++++--- sklearn/preprocessing/label.py | 72 ++- sklearn/preprocessing/tests/test_encoders.py | 417 +++++++++++----- sklearn/preprocessing/tests/test_label.py | 71 ++- 4 files changed, 818 insertions(+), 225 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 9104d1028a1b8..c1acfbe799485 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,14 +2,20 @@ # Joris Van den Bossche # License: BSD 3 clause +import numbers +import warnings + import numpy as np from scipy import sparse +from .. 
import get_config as _get_config from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils.fixes import _argmax +from ..utils import deprecated +from ..utils.fixes import _argmax, _object_dtype_isnan from ..utils.validation import check_is_fitted +from .base import _transform_selected from .label import _encode, _encode_check_unknown @@ -26,7 +32,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin): """ - def _check_X(self, X, force_all_finite=True): + def _check_X(self, X): """ Perform custom check_array: - convert list of strings to object dtype @@ -40,10 +46,10 @@ def _check_X(self, X, force_all_finite=True): """ if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation - X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite) + X_temp = check_array(X, dtype=None) if (not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_)): - X = check_array(X, dtype=np.object, force_all_finite=force_all_finite) + X = check_array(X, dtype=np.object) else: X = X_temp needs_validation = False @@ -71,10 +77,10 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X, force_all_finite='allow-nan') + X_list, n_samples, n_features = self._check_X(X) - if self.categories != 'auto': - if len(self.categories) != n_features: + if self._categories != 'auto': + if len(self._categories) != n_features: raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") @@ -82,10 +88,10 @@ def _fit(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self.categories == 'auto': + if self._categories == 'auto': cats = _encode(Xi) else: - cats = np.array(self.categories[i], dtype=Xi.dtype) + cats = np.array(self._categories[i], dtype=Xi.dtype) if Xi.dtype != object: if not np.all(np.sort(cats) == cats): raise ValueError("Unsorted categories are not " @@ -99,19 +105,11 @@ def _fit(self, X, handle_unknown='error'): self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X, force_all_finite=('allow-nan')) + X_list, n_samples, n_features = self._check_X(X) X_int = np.zeros((n_samples, n_features), dtype=np.int) X_mask = np.ones((n_samples, n_features), dtype=np.bool) - if n_features != len(self.categories_): - raise ValueError( - "The number of features in X is different to the number of " - "features of the fitted data. The fitted data had {} features " - "and the X has {} features." - .format(len(self.categories_,), n_features) - ) - for i in range(n_features): Xi = X_list[i] diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], @@ -136,31 +134,26 @@ def _transform(self, X, handle_unknown='error'): Xi = Xi.copy() Xi[~valid_mask] = self.categories_[i][0] - # We use check_unknown=False, since _encode_check_unknown was - # already called above. - _, encoded = _encode(Xi, self.categories_[i], encode=True, - check_unknown=False) + _, encoded = _encode(Xi, self.categories_[i], encode=True) X_int[:, i] = encoded return X_int, X_mask - def _more_tags(self): - return {'X_types': ['categorical']} - class OneHotEncoder(_BaseEncoder): - """Encode categorical features as a one-hot numeric array. + """Encode categorical integer features as a one-hot numeric array. 
The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array (depending on the ``sparse`` - parameter) + returns a sparse matrix or dense array. By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. + The OneHotEncoder previously assumed that the input features take on + values in the range [0, max(values)). This behaviour is deprecated. This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. @@ -209,6 +202,34 @@ class OneHotEncoder(_BaseEncoder): will be all zeros. In the inverse transform, an unknown category will be denoted as None. + n_values : 'auto', int or array of ints, default='auto' + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : number of categorical values per feature. + Each feature value should be in ``range(n_values)`` + - array : ``n_values[i]`` is the number of categorical values in + ``X[:, i]``. Each feature value should be + in ``range(n_values[i])`` + + .. deprecated:: 0.20 + The `n_values` keyword was deprecated in version 0.20 and will + be removed in 0.22. Use `categories` instead. + + categorical_features : 'all' or array of indices or mask, default='all' + Specify what features are treated as categorical. + + - 'all': All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + .. deprecated:: 0.20 + The `categorical_features` keyword was deprecated in version + 0.20 and will be removed in 0.22. + You can use the ``ColumnTransformer`` instead. + Attributes ---------- categories_ : list of arrays @@ -222,6 +243,31 @@ class OneHotEncoder(_BaseEncoder): be dropped for each feature. None if all the transformed features will be retained. + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + .. deprecated:: 0.20 + The ``active_features_`` attribute was deprecated in version + 0.20 and will be removed in 0.22. + + feature_indices_ : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by ``active_features_`` afterwards) + + .. deprecated:: 0.20 + The ``feature_indices_`` attribute was deprecated in version + 0.20 and will be removed in 0.22. + + n_values_ : array of shape (n_features,) + Maximum number of values per feature. + + .. deprecated:: 0.20 + The ``n_values_`` attribute was deprecated in version + 0.20 and will be removed in 0.22. + Examples -------- Given a dataset with two features, we let the encoder find the unique @@ -231,7 +277,11 @@ class OneHotEncoder(_BaseEncoder): >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - OneHotEncoder(handle_unknown='ignore') + ... # doctest: +ELLIPSIS + ... 
# doctest: +NORMALIZE_WHITESPACE + OneHotEncoder(categorical_features=None, categories=None, drop=None, + dtype=<... 'numpy.float64'>, handle_unknown='ignore', + n_values=None, sparse=True) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] @@ -265,27 +315,184 @@ class OneHotEncoder(_BaseEncoder): matrix indicating the presence of a class label. """ - def __init__(self, categories='auto', drop=None, sparse=True, - dtype=np.float64, handle_unknown='error'): + def __init__(self, n_values=None, categorical_features=None, + categories=None, drop=None, sparse=True, dtype=np.float64, + handle_unknown='error'): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown + self.n_values = n_values + self.categorical_features = categorical_features self.drop = drop - def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - # If we have both dropped columns and ignored unknown - # values, there will be ambiguous cells. This creates difficulties - # in interpreting the model. - if self.drop is not None and self.handle_unknown != 'error': + # Deprecated attributes + + @deprecated("The ``active_features_`` attribute was deprecated in version " + "0.20 and will be removed 0.22.") + @property + def active_features_(self): + check_is_fitted(self, 'categories_') + return self._active_features_ + + @deprecated("The ``feature_indices_`` attribute was deprecated in version " + "0.20 and will be removed 0.22.") + @property + def feature_indices_(self): + check_is_fitted(self, 'categories_') + return self._feature_indices_ + + @deprecated("The ``n_values_`` attribute was deprecated in version " + "0.20 and will be removed 0.22.") + @property + def n_values_(self): + check_is_fitted(self, 'categories_') + return self._n_values_ + + def _handle_deprecations(self, X): + # internal version of the attributes to handle deprecations + self._n_values = self.n_values + self._categories = getattr(self, '_categories', None) + self._categorical_features = getattr(self, '_categorical_features', + None) + + # user manually set the categories or second fit -> never legacy mode + if self.categories is not None or self._categories is not None: + self._legacy_mode = False + if self.categories is not None: + self._categories = self.categories + + # categories not set -> infer if we need legacy mode or not + elif self.n_values is not None and self.n_values != 'auto': + msg = ( + "Passing 'n_values' is deprecated in version 0.20 and will be " + "removed in 0.22. You can use the 'categories' keyword " + "instead. 'n_values=n' corresponds to " + "'categories=[range(n)] * n_features'." + ) + warnings.warn(msg, DeprecationWarning) + self._legacy_mode = True + + else: # n_values = 'auto' + # n_values can also be None (default to catch usage), so set + # _n_values to 'auto' explicitly + self._n_values = 'auto' + if self.handle_unknown == 'ignore': + # no change in behaviour, no need to raise deprecation warning + self._legacy_mode = False + self._categories = 'auto' + if self.n_values == 'auto': + # user manually specified this + msg = ( + "Passing 'n_values' is deprecated in version 0.20 and " + "will be removed in 0.22. n_values='auto' can be " + "replaced with categories='auto'." 
+ ) + warnings.warn(msg, DeprecationWarning) + else: + # check if we have integer or categorical input + try: + check_array(X, dtype=np.int) + except ValueError: + self._legacy_mode = False + self._categories = 'auto' + else: + if self.drop is None: + msg = ( + "The handling of integer data will change in " + "version 0.22. Currently, the categories are " + "determined based on the range " + "[0, max(values)], while in the future they " + "will be determined based on the unique " + "values.\nIf you want the future behaviour " + "and silence this warning, you can specify " + "\"categories='auto'\".\n" + "In case you used a LabelEncoder before this " + "OneHotEncoder to convert the categories to " + "integers, then you can now use the " + "OneHotEncoder directly." + ) + warnings.warn(msg, FutureWarning) + self._legacy_mode = True + else: + msg = ( + "The handling of integer data will change in " + "version 0.22. Currently, the categories are " + "determined based on the range " + "[0, max(values)], while in the future they " + "will be determined based on the unique " + "values.\n The old behavior is not compatible " + "with the `drop` parameter. Instead, you " + "must manually specify \"categories='auto'\" " + "if you wish to use the `drop` parameter on " + "an array of entirely integer data. This will " + "enable the future behavior." + ) + raise ValueError(msg) + + # if user specified categorical_features -> always use legacy mode + if self.categorical_features is not None: + if (isinstance(self.categorical_features, str) + and self.categorical_features == 'all'): + warnings.warn( + "The 'categorical_features' keyword is deprecated in " + "version 0.20 and will be removed in 0.22. The passed " + "value of 'all' is the default and can simply be removed.", + DeprecationWarning) + else: + if self.categories is not None: + raise ValueError( + "The 'categorical_features' keyword is deprecated, " + "and cannot be used together with specifying " + "'categories'.") + warnings.warn( + "The 'categorical_features' keyword is deprecated in " + "version 0.20 and will be removed in 0.22. You can " + "use the ColumnTransformer instead.", DeprecationWarning) + # Set categories_ to empty list if no categorical columns exist + n_features = X.shape[1] + sel = np.zeros(n_features, dtype=bool) + sel[np.asarray(self.categorical_features)] = True + if sum(sel) == 0: + self.categories_ = [] + self._legacy_mode = True + self._categorical_features = self.categorical_features + else: + self._categorical_features = 'all' + + # Prevents new drop functionality from being used in legacy mode + if self._legacy_mode and self.drop is not None: raise ValueError( - "`handle_unknown` must be 'error' when the drop parameter is " - "specified, as both would create categories that are all " - "zero.") + "The `categorical_features` and `n_values` keywords " + "are deprecated, and cannot be used together " + "with 'drop'.") + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to determine the categories of each feature. 
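        For instance (a sketch against this 0.20-era API, where passing
        ``categories='auto'`` opts out of the legacy integer handling
        described above):

        >>> enc = OneHotEncoder(categories='auto',
        ...                     handle_unknown='ignore')
        >>> _ = enc.fit([['a', 0], ['b', 1]])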
+ + Returns + ------- + self + """ + + self._validate_keywords() + + self._handle_deprecations(X) + + if self._legacy_mode: + _transform_selected(X, self._legacy_fit_transform, self.dtype, + self._categorical_features, + copy=True) + return self + else: + self._fit(X, handle_unknown=self.handle_unknown) + self.drop_idx_ = self._compute_drop_idx() + return self def _compute_drop_idx(self): if self.drop is None: @@ -323,22 +530,78 @@ def _compute_drop_idx(self): "'first', None or array of objects, got {}") raise ValueError(msg.format(type(self.drop))) - def fit(self, X, y=None): - """Fit OneHotEncoder to X. + def _validate_keywords(self): + if self.handle_unknown not in ('error', 'ignore'): + msg = ("handle_unknown should be either 'error' or 'ignore', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + # If we have both dropped columns and ignored unknown + # values, there will be ambiguous cells. This creates difficulties + # in interpreting the model. + if self.drop is not None and self.handle_unknown != 'error': + raise ValueError( + "`handle_unknown` must be 'error' when the drop parameter is " + "specified, as both would create categories that are all " + "zero.") - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to determine the categories of each feature. + def _legacy_fit_transform(self, X): + """Assumes X contains only categorical features.""" + dtype = getattr(X, 'dtype', None) + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("OneHotEncoder in legacy mode cannot handle " + "categories encoded as negative integers. " + "Please set categories='auto' explicitly to " + "be able to use arbitrary integer values as " + "category identifiers.") + n_samples, n_features = X.shape + if (isinstance(self._n_values, str) and + self._n_values == 'auto'): + n_values = np.max(X, axis=0) + 1 + elif isinstance(self._n_values, numbers.Integral): + if (np.max(X, axis=0) >= self._n_values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % self._n_values) + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self._n_values) + else: + try: + n_values = np.asarray(self._n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. 
Expected" + " 'auto', int or array of ints, got %r" + % type(self._n_values)) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") - Returns - ------- - self - """ - self._validate_keywords() - self._fit(X, handle_unknown=self.handle_unknown) - self.drop_idx_ = self._compute_drop_idx() - return self + self._n_values_ = n_values + self.categories_ = [np.arange(n_val - 1, dtype=dtype) + for n_val in n_values] + n_values = np.hstack([[0], n_values]) + indices = np.cumsum(n_values) + self._feature_indices_ = indices + + column_indices = (X + indices[:-1]).ravel() + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features) + data = np.ones(n_samples * n_features) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + if (isinstance(self._n_values, str) and + self._n_values == 'auto'): + mask = np.array(out.sum(axis=0)).ravel() != 0 + active_features = np.where(mask)[0] + out = out[:, active_features] + self._active_features_ = active_features + + self.categories_ = [ + np.unique(X[:, i]).astype(dtype) if dtype + else np.unique(X[:, i]) for i in range(n_features)] + + return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. @@ -355,23 +618,64 @@ def fit_transform(self, X, y=None): X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ + self._validate_keywords() - return super().fit_transform(X, y) - def transform(self, X): - """Transform X using one-hot encoding. + self._handle_deprecations(X) - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to encode. + if self._legacy_mode: + return _transform_selected( + X, self._legacy_fit_transform, self.dtype, + self._categorical_features, copy=True) + else: + return self.fit(X).transform(X) + + def _legacy_transform(self, X): + """Assumes X contains only categorical features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("OneHotEncoder in legacy mode cannot handle " + "categories encoded as negative integers. " + "Please set categories='auto' explicitly to " + "be able to use arbitrary integer values as " + "category identifiers.") + n_samples, n_features = X.shape - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ - check_is_fitted(self) + indices = self._feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + # We use only those categorical features of X that are known using fit. + # i.e lesser than n_values_ using mask. + # This means, if self.handle_unknown is "ignore", the row_indices and + # col_indices corresponding to the unknown categorical feature are + # ignored. + mask = (X < self._n_values_).ravel() + if np.any(~mask): + if self.handle_unknown not in ['error', 'ignore']: + raise ValueError("handle_unknown should be either error or " + "unknown got %s" % self.handle_unknown) + if self.handle_unknown == 'error': + raise ValueError("unknown categorical feature present %s " + "during transform." 
% X.ravel()[~mask]) + + column_indices = (X + indices[:-1]).ravel()[mask] + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + n_features)[mask] + data = np.ones(np.sum(mask)) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + if (isinstance(self._n_values, str) and + self._n_values == 'auto'): + out = out[:, self._active_features_] + + return out if self.sparse else out.toarray() + + def _transform_new(self, X): + """New implementation assuming categorical input""" # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) @@ -406,6 +710,27 @@ def transform(self, X): else: return out + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to encode. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array + Transformed input. + """ + check_is_fitted(self, 'categories_') + if self._legacy_mode: + return _transform_selected(X, self._legacy_transform, self.dtype, + self._categorical_features, + copy=True) + else: + return self._transform_new(X) + def inverse_transform(self, X): """Convert the back data to the original representation. @@ -423,7 +748,10 @@ def inverse_transform(self, X): Inverse transformed array. """ - check_is_fitted(self) + # if self._legacy_mode: + # raise ValueError("only supported for categorical features") + + check_is_fitted(self, 'categories_') X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -506,7 +834,7 @@ def get_feature_names(self, input_features=None): output_feature_names : array of string, length n_output_features """ - check_is_fitted(self) + check_is_fitted(self, 'categories_') cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] @@ -568,7 +896,8 @@ class OrdinalEncoder(_BaseEncoder): >>> enc = OrdinalEncoder() >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - OrdinalEncoder() + ... # doctest: +ELLIPSIS + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) @@ -604,6 +933,9 @@ def fit(self, X, y=None): self """ + # base classes uses _categories to deal with deprecations in + # OneHoteEncoder: can be removed once deprecations are removed + self._categories = self.categories self._fit(X) return self @@ -639,7 +971,7 @@ def inverse_transform(self, X): Inverse transformed array. 
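        A round-trip sketch (illustrative):

        >>> enc = OrdinalEncoder()
        >>> codes = enc.fit_transform([['Male', 1], ['Female', 3]])
        >>> codes
        array([[1., 0.],
               [0., 1.]])
        >>> enc.inverse_transform(codes)  # doctest: +SKIP
        array([['Male', 1],
               ['Female', 3]], dtype=object)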
""" - check_is_fitted(self) + check_is_fitted(self, 'categories_') X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -660,3 +992,6 @@ def inverse_transform(self, X): X_tr[:, i] = self.categories_[i][labels] return X_tr + + def _more_tags(self): + return {'X_types': ['categorical']} diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index b8c801052fceb..3adf40cc519b2 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -47,20 +47,31 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): # only used in _encode below, see docstring there for details - if allow_nan: - # `np.unique` does not work here - return _encode_python(values, uniques, encode, - allow_nan) if uniques is None: if encode: uniques, encoded = np.unique(values, return_inverse=True) + # np.nan is always sorted last + if len(uniques) and is_scalar_nan(uniques[-1]): + if not allow_nan: + raise ValueError('nan found in values and allow_nan=False') + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + if encode: + encoded[encoded > nan_idx] = nan_idx return uniques, encoded else: # unique sorts - return np.unique(values) + uniques = np.unique(values) + # np.nan is always sorted last + if len(uniques) and is_scalar_nan(uniques[-1]): + if not allow_nan: + raise ValueError('nan found in values and allow_nan=False') + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + return uniques if encode: if check_unknown: - diff = _encode_check_unknown(values, uniques) + diff = _encode_check_unknown(values, uniques, allow_nan=allow_nan) if diff: raise ValueError("y contains previously unseen labels: %s" % str(diff)) @@ -73,10 +84,13 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, def _encode_python(values, uniques=None, encode=False, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: - if allow_nan: - missing_mask = _get_mask(values, np.nan) - if np.any(missing_mask): - uniques = sorted(set(values[~missing_mask]) | {np.nan}) + missing_mask = _get_mask(values, np.nan) + if np.any(missing_mask): + if not allow_nan: + raise ValueError('nan found in values and allow_nan=False') + else: + # sorted([4, np.nan]) != np.sort([4, np.nan]) + uniques = np.sort(list(set(values[~missing_mask]) | {np.nan})) else: uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) @@ -99,6 +113,11 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) + except UnboundLocalError as e: + # 'nan_index' referenced before assignment + raise ValueError("y contains previously unseen label nan, " + "consider using allow_nan=True. %s" + % str(e)) return uniques, encoded else: return uniques @@ -108,8 +127,12 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): """Helper function to factorize (find uniques) and encode values. - Uses pure python method for object dtype or if values contains nan, - and numpy method for all other dtypes. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + The numpy method has the limitation that the `uniques` need to + be sorted. Importantly, this is not checked but assumed to already be + the case. The calling method needs to ensure this for all non-object + values. 
Parameters ---------- @@ -127,6 +150,9 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, True in this case. This parameter is useful for _BaseEncoder._transform() to avoid calling _encode_check_unknown() twice. + allow_nan : bool, default False + if True, encode np.nan as another category. Otherwise raise an error + if nan are present Returns ------- @@ -148,7 +174,7 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, check_unknown, allow_nan) -def _encode_check_unknown(values, uniques, return_mask=False): +def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): """ Helper function to check for unknowns in values to be encoded. @@ -176,13 +202,16 @@ def _encode_check_unknown(values, uniques, return_mask=False): """ if values.dtype == object: uniques_set = set(uniques) - diff = list(set(values) - uniques_set) + diff = np.array(list(set(values) - uniques_set)) # set([np.nan]) - set([np.nan]) returns set() # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} if diff and any(_get_mask(diff, np.nan)): - if any(_get_mask(uniques_set, np.nan)) and\ - any(_get_mask(set(values), np.nan)): - diff = diff[~_get_mask(diff, np.nan)] + if not allow_nan: + raise ValueError('Nan found during check_unknown') + else: + if any(_get_mask(uniques_set, np.nan)) and\ + any(_get_mask(set(values), np.nan)): + diff = diff[~_get_mask(diff, np.nan)] if return_mask: if diff: valid_mask = np.array([val in uniques_set for val in values]) @@ -196,9 +225,12 @@ def _encode_check_unknown(values, uniques, return_mask=False): diff = np.setdiff1d(unique_values, uniques, assume_unique=True) # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] if any(_get_mask(diff, np.nan)): - if any(_get_mask(unique_values, np.nan)) and\ - any(_get_mask(uniques, np.nan)): - diff = [x for x in diff if not is_scalar_nan(x)] + if not allow_nan: + raise ValueError('Nan found during check_unknown') + else: + if any(_get_mask(unique_values, np.nan)) and\ + any(_get_mask(uniques, np.nan)): + diff = [x for x in diff if not is_scalar_nan(x)] if return_mask: if diff: valid_mask = np.in1d(values, uniques) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index d9c5092affd2d..29cd6602e4f10 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -8,8 +8,14 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns +from sklearn.utils.testing import assert_warns_message +from sklearn.utils.testing import assert_no_warnings from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder @@ -21,37 +27,228 @@ def toarray(a): return a -def test_one_hot_encoder_sparse_dense(): - # check that sparse and dense will give the same results +def test_one_hot_encoder_sparse(): + # Test OneHotEncoder's fit and transform. 
+ X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=4) + enc = OneHotEncoder(n_values=4) + with ignore_warnings(category=DeprecationWarning): + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=[3, 2, 2]) + enc = OneHotEncoder(n_values=[3, 2, 2]) + with ignore_warnings(category=DeprecationWarning): + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raised when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + error_msg = r"unknown categorical feature present \[2\] during transform" + assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) + with ignore_warnings(category=DeprecationWarning): + assert_raises( + ValueError, + OneHotEncoder(n_values=2).fit_transform, X) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + + # test that error is raised when wrong number of features in fit + # with prespecified n_values + with ignore_warnings(category=DeprecationWarning): + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + with ignore_warnings(category=DeprecationWarning): + assert_raises( + TypeError, OneHotEncoder(n_values=np.int).fit, X) - X = np.array([[3, 2, 1], [0, 1, 1]]) - enc_sparse = OneHotEncoder() - enc_dense = OneHotEncoder(sparse=False) + enc = OneHotEncoder() + # test negative input to fit + with ignore_warnings(category=FutureWarning): + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + with ignore_warnings(category=FutureWarning): + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + enc = OneHotEncoder(drop='first', n_values=1) + for method in (enc.fit, enc.fit_transform): + assert_raises_regex( + ValueError, + 'The `categorical_features` and `n_values` keywords ', + method, [[0], [-1]]) + + enc = OneHotEncoder(drop='first', categorical_features='all') + assert_raises_regex( + ValueError, + 'The `categorical_features` and `n_values` keywords ', + method, [[0], [-1]]) + + +def test_one_hot_encoder_dense(): + # check for sparse=False + X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder(sparse=False) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + # discover max values automatically + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - X_trans_sparse = 
enc_sparse.fit_transform(X) - X_trans_dense = enc_dense.fit_transform(X) + # check outcome + assert_array_equal(X_trans, + np.array([[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]])) + + +def test_one_hot_encoder_deprecationwarnings(): + for X in [[[3, 2, 1], [0, 1, 1]], + [[3., 2., 1.], [0., 1., 1.]]]: + enc = OneHotEncoder() + assert_warns_message(FutureWarning, "handling of integer", + enc.fit, X) + enc = OneHotEncoder() + assert_warns_message(FutureWarning, "handling of integer", + enc.fit_transform, X) + + # check it still works correctly as well + with ignore_warnings(category=FutureWarning): + X_trans = enc.fit_transform(X).toarray() + res = [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]] + assert_array_equal(X_trans, res) + + # check deprecated attributes + assert_warns(DeprecationWarning, lambda: enc.active_features_) + assert_warns(DeprecationWarning, lambda: enc.feature_indices_) + assert_warns(DeprecationWarning, lambda: enc.n_values_) + + # check no warning is raised if keyword is specified + enc = OneHotEncoder(categories='auto') + assert_no_warnings(enc.fit, X) + enc = OneHotEncoder(categories='auto') + assert_no_warnings(enc.fit_transform, X) + X_trans = enc.fit_transform(X).toarray() + assert_array_equal(X_trans, res) - assert X_trans_sparse.shape == (2, 5) - assert X_trans_dense.shape == (2, 5) + # check there is also a warning if the default is passed + enc = OneHotEncoder(n_values='auto', handle_unknown='ignore') + assert_warns(DeprecationWarning, enc.fit, X) - assert sparse.issparse(X_trans_sparse) - assert not sparse.issparse(X_trans_dense) + X = np.array([['cat1', 'cat2']], dtype=object).T + enc = OneHotEncoder(categorical_features='all') + assert_warns(DeprecationWarning, enc.fit, X) - # check outcome - assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) +def test_one_hot_encoder_force_new_behaviour(): + # ambiguous integer case (non secutive range of categories) + X = np.array([[1, 2]]).T + X2 = np.array([[0, 1]]).T -def test_one_hot_encoder_diff_n_features(): - X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) - X2 = np.array([[1, 0]]) + # without argument -> by default using legacy behaviour with warnings enc = OneHotEncoder() + + with ignore_warnings(category=FutureWarning): + enc.fit(X) + + res = enc.transform(X2) + exp = np.array([[0, 0], [1, 0]]) + assert_array_equal(res.toarray(), exp) + + # with explicit auto argument -> don't use legacy behaviour + # (so will raise an error on unseen value within range) + enc = OneHotEncoder(categories='auto') enc.fit(X) - err_msg = ("The number of features in X is different to the number of " - "features of the fitted data.") - with pytest.raises(ValueError, match=err_msg): - enc.transform(X2) + assert_raises(ValueError, enc.transform, X2) + + +def _run_one_hot(X, X2, cat): + # enc = assert_warns( + # DeprecationWarning, + # OneHotEncoder, categorical_features=cat) + enc = OneHotEncoder(categorical_features=cat) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + Xtr = enc.fit_transform(X) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + X2tr = enc.fit(X).transform(X2) + return Xtr, X2tr + + +def _check_one_hot(X, X2, cat, n_features): + ind = np.where(cat)[0] + # With mask + A, B = _run_one_hot(X, X2, cat) + # With indices + C, D = _run_one_hot(X, X2, ind) + # Check shape + assert_equal(A.shape, (2, n_features)) + assert_equal(B.shape, (1, n_features)) + assert_equal(C.shape, (2, 
n_features))
+    assert_equal(D.shape, (1, n_features))
+    # Check that mask and indices give the same results
+    assert_array_equal(toarray(A), toarray(C))
+    assert_array_equal(toarray(B), toarray(D))
+
+
+def test_one_hot_encoder_categorical_features():
+    X = np.array([[3, 2, 1], [0, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
+
+    cat = [True, False, False]
+    _check_one_hot(X, X2, cat, 4)
+
+    # Edge case: all non-categorical
+    cat = [False, False, False]
+    _check_one_hot(X, X2, cat, 3)
+
+    # Edge case: all categorical
+    cat = [True, True, True]
+    _check_one_hot(X, X2, cat, 5)
+
+    # check error raised if also specifying categories
+    oh = OneHotEncoder(categories=[range(3)],
+                       categorical_features=[True, False, False])
+    assert_raises(ValueError, oh.fit, X)
+
+
+def test_one_hot_encoder_categorical_features_ignore_unknown():
+    # GH12881 bug in combination of categorical_features with ignore
+    X = np.array([[1, 2, 3], [4, 5, 6], [2, 3, 2]]).T
+    oh = OneHotEncoder(categorical_features=[2], handle_unknown='ignore')
+
+    with ignore_warnings(category=DeprecationWarning):
+        res = oh.fit_transform(X)
+
+    expected = np.array([[1, 0, 1], [0, 1, 0], [1, 2, 3], [4, 5, 6]]).T
+    assert_array_equal(res.toarray(), expected)


 def test_one_hot_encoder_handle_unknown():
@@ -61,9 +258,8 @@ def test_one_hot_encoder_handle_unknown():
     # Test that one hot encoder raises error for unknown features
     # present during transform.
     oh = OneHotEncoder(handle_unknown='error')
-    oh.fit(X)
-    with pytest.raises(ValueError, match='Found unknown categories'):
-        oh.transform(X2)
+    assert_warns(FutureWarning, oh.fit, X)
+    assert_raises(ValueError, oh.transform, X2)

     # Test the ignore option, ignores unknown features (giving all 0's)
     oh = OneHotEncoder(handle_unknown='ignore')
@@ -77,8 +273,7 @@ def test_one_hot_encoder_handle_unknown():
     # Raise error if handle_unknown is neither ignore or error.
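     # A minimal sketch of the contract this test exercises (an editorial
     # example, not code from the patch; it assumes only the OneHotEncoder
     # API used above): with handle_unknown='ignore', an unseen category
     # encodes to an all-zero row instead of raising.
     #
     #     enc = OneHotEncoder(handle_unknown='ignore', categories='auto')
     #     enc.fit([['a'], ['b']])
     #     enc.transform([['c']]).toarray()   # -> array([[0., 0.]])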
oh = OneHotEncoder(handle_unknown='42') - with pytest.raises(ValueError, match='handle_unknown should be either'): - oh.fit(X) + assert_raises(ValueError, oh.fit, X) def test_one_hot_encoder_not_fitted(): @@ -90,6 +285,19 @@ def test_one_hot_encoder_not_fitted(): enc.transform(X) +def test_one_hot_encoder_no_categorical_features(): + X = np.array([[3, 2, 1], [0, 1, 1]], dtype='float64') + + cat = [False, False, False] + enc = OneHotEncoder(categorical_features=cat) + with ignore_warnings(category=(DeprecationWarning, FutureWarning)): + X_tr = enc.fit_transform(X) + expected_features = np.array([], dtype='object') + assert_array_equal(X, X_tr) + assert_array_equal(enc.get_feature_names(), expected_features) + assert enc.categories_ == [] + + def test_one_hot_encoder_handle_unknown_strings(): X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1)) X2 = np.array(['55555', '22']).reshape((-1, 1)) @@ -138,47 +346,6 @@ def test_one_hot_encoder_dtype_pandas(output_dtype): assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) -def test_one_hot_encoder_feature_names(): - enc = OneHotEncoder() - X = [['Male', 1, 'girl', 2, 3], - ['Female', 41, 'girl', 1, 10], - ['Male', 51, 'boy', 12, 3], - ['Male', 91, 'girl', 21, 30]] - - enc.fit(X) - feature_names = enc.get_feature_names() - assert isinstance(feature_names, np.ndarray) - - assert_array_equal(['x0_Female', 'x0_Male', - 'x1_1', 'x1_41', 'x1_51', 'x1_91', - 'x2_boy', 'x2_girl', - 'x3_1', 'x3_2', 'x3_12', 'x3_21', - 'x4_3', - 'x4_10', 'x4_30'], feature_names) - - feature_names2 = enc.get_feature_names(['one', 'two', - 'three', 'four', 'five']) - - assert_array_equal(['one_Female', 'one_Male', - 'two_1', 'two_41', 'two_51', 'two_91', - 'three_boy', 'three_girl', - 'four_1', 'four_2', 'four_12', 'four_21', - 'five_3', 'five_10', 'five_30'], feature_names2) - - with pytest.raises(ValueError, match="input_features should have length"): - enc.get_feature_names(['one', 'two']) - - -def test_one_hot_encoder_feature_names_unicode(): - enc = OneHotEncoder() - X = np.array([['c❤t1', 'dat2']], dtype=object).T - enc.fit(X) - feature_names = enc.get_feature_names() - assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names) - feature_names = enc.get_feature_names(input_features=['n👍me']) - assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names) - - def test_one_hot_encoder_set_params(): X = np.array([[1, 2]]).T oh = OneHotEncoder() @@ -220,31 +387,6 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) -@pytest.mark.parametrize("X", [ - [['def', 1, np.nan], ['abc', 2, np.nan]], - np.array([[10, 1, np.nan], [5, 2, np.nan]]), - np.array([['b', 'A', np.nan], ['a', 'B', np.nan]], dtype=object) - ], ids=['mixed', 'numeric', 'object']) -def test_one_hot_encoder_with_nan(X): - Xtr = check_categorical_onehot(np.array(X)[:, [0]]) - assert_allclose(Xtr, [[0, 1], [1, 0]]) - - Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) - assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) - - Xtr = OneHotEncoder(categories='auto').fit_transform(X) - assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) - -def test_ohe_handle_unknow_sparse_nan(): - # TODO - pass - - -def test_ohe_handle_unknow_nan(): - # TODO - pass - - @pytest.mark.parametrize('sparse_', [False, True]) @pytest.mark.parametrize('drop', [None, 'first']) def test_one_hot_encoder_inverse(sparse_, drop): @@ -469,14 +611,18 @@ def test_one_hot_encoder_feature_names_drop(drop, expected_names): @pytest.mark.parametrize("as_data_frame", [False, 
True], ids=['array', 'dataframe']) @pytest.mark.parametrize("handle_unknown", ['error', 'ignore']) -def test_one_hot_encoder_accept_nan(X, as_data_frame, handle_unknown): +def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): if as_data_frame: pd = pytest.importorskip('pandas') X = pd.DataFrame(X) ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown) - ohe.fit(X) - ohe.fit_transform(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit_transform(X) if as_data_frame: X_partial = X.iloc[:1, :] @@ -484,7 +630,9 @@ def test_one_hot_encoder_accept_nan(X, as_data_frame, handle_unknown): X_partial = X[:1, :] ohe.fit(X_partial) - ohe.transform(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.transform(X) @pytest.mark.parametrize("X", [ @@ -542,6 +690,24 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) +@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, + np.array([['a', np.nan]], dtype=object).T], + ids=['numeric', 'object']) +def test_ordinal_encoder_raise_missing(X): + ohe = OrdinalEncoder() + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit(X) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.fit_transform(X) + + ohe.fit(X[:1, :]) + + with pytest.raises(ValueError, match="Input contains NaN"): + ohe.transform(X) + + def test_ordinal_encoder_raise_categories_shape(): X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T @@ -591,7 +757,7 @@ def test_encoder_dtypes_pandas(): assert_array_equal(enc.transform(X).toarray(), exp) X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]}) - X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype] + X_type = [int, object, float] enc.fit(X) assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)]) assert_array_equal(enc.transform(X).toarray(), exp) @@ -622,23 +788,39 @@ def test_one_hot_encoder_drop_manual(): enc.inverse_transform(trans)) -@pytest.mark.parametrize( - "X_fit, params, err_msg", - [([["Male"], ["Female"]], {'drop': 'second'}, - "Wrong input for parameter `drop`"), - ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'}, - "`handle_unknown` must be 'error'"), - ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]], - {'drop': np.asarray('b', dtype=object)}, - "Wrong input for parameter `drop`"), - ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]], - {'drop': ['ghi', 3, 59]}, - "The following categories were supposed")] -) -def test_one_hot_encoder_invalid_params(X_fit, params, err_msg): - enc = OneHotEncoder(**params) - with pytest.raises(ValueError, match=err_msg): - enc.fit(X_fit) +def test_one_hot_encoder_invalid_params(): + enc = OneHotEncoder(drop='second') + assert_raises_regex( + ValueError, + "Wrong input for parameter `drop`.", + enc.fit, [["Male"], ["Female"]]) + + enc = OneHotEncoder(handle_unknown='ignore', drop='first') + assert_raises_regex( + ValueError, + "`handle_unknown` must be 'error'", + enc.fit, [["Male"], ["Female"]]) + + enc = OneHotEncoder(drop='first') + assert_raises_regex( + ValueError, + "The handling of integer data will change in version", + enc.fit, [[1], [2]]) + + enc = OneHotEncoder(drop='first', categories='auto') + assert_no_warnings(enc.fit_transform, [[1], [2]]) + + enc = OneHotEncoder(drop=np.asarray('b', dtype=object)) + assert_raises_regex( + ValueError, + "Wrong input for parameter `drop`.", + 
enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) + + enc = OneHotEncoder(drop=['ghi', 3, 59]) + assert_raises_regex( + ValueError, + "The following categories were supposed", + enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) @pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']]) @@ -672,8 +854,3 @@ def test_categories(density, drop): assert cat_list[drop_idx] == drop_cat assert isinstance(ohe_test.drop_idx_, np.ndarray) assert ohe_test.drop_idx_.dtype == np.int_ - - -@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) -def test_encoders_has_categorical_tags(Encoder): - assert 'categorical' in Encoder()._get_tags()['X_types'] diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index de5c4a7191a91..c141cd12b97fc 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -208,20 +208,30 @@ def test_label_encoder_negative_ints(): def test_label_encode_with_nan(): - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), allow_nan=True)) == 1 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), allow_nan=True)) == 1 - assert len(_encode(np.asarray([4, np.nan]), allow_nan=True)) == 2 + # encode all nan within one category + assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), + allow_nan=True)) == 1 + assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), + allow_nan=True)) == 1 + assert len(_encode(np.asarray([4, np.nan, np.nan]), allow_nan=True)) == 2 + + # the encoded size corresponds to the values size assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), encode=True, allow_nan=True)[1]) == 2 assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), encode=True, allow_nan=True)[1]) == 2 - assert len(_encode(np.asarray([4, np.nan, np.nan, np.nan]), - encode=True, allow_nan=True)[1]) == 4 + + encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan]), + encode=True, allow_nan=True)[1] + assert_array_equal(encoded, [0, 1, 2, 2, 2]) -def test_label_encode_with_mixed_type(): - assert len(_encode(np.asarray([4, 'm']))) == 2 +@pytest.mark.parametrize("values", + [np.asarray([np.nan, np.nan], dtype=float), + np.asarray([np.nan, np.nan], dtype=object)]) +def test_label_encode_raise_nan(values): + assert_raises(ValueError, _encode, values, allow_nan=False) @pytest.mark.parametrize("dtype", ['str', 'object']) @@ -623,7 +633,10 @@ def test_encode_util(values, expected): assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) -def test_encode_check_unknown(): +@pytest.mark.parametrize( + "allow_nan", + [True, False]) +def test_encode_check_unknown(allow_nan): # test for the check_unknown parameter of _encode() uniques = np.array([1, 2, 3]) values = np.array([1, 2, 3, 4]) @@ -631,14 +644,50 @@ def test_encode_check_unknown(): # Default is True, raise error with pytest.raises(ValueError, match='y contains previously unseen labels'): - _encode(values, uniques, encode=True, check_unknown=True) + _encode(values, uniques, encode=True, check_unknown=True, + allow_nan=allow_nan) # dont raise error if False - _encode(values, uniques, encode=True, check_unknown=False) + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) # parameter is ignored for object dtype uniques = np.array(['a', 'b', 'c'], dtype=object) values = np.array(['a', 'b', 'c', 'd'], dtype=object) with pytest.raises(ValueError, match='y contains previously unseen labels'): - _encode(values, uniques, encode=True, 
check_unknown=False) + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) + + +@pytest.mark.parametrize( + "uniques, values", + [(np.array([1, 2, 3]), + np.array([1, 2, 3, np.nan])), + (np.array([np.nan, 2, 3]), + np.array([np.nan, 2, 3, 4]))]) +def test_encode_check_unknown_nan_float(uniques, values): + # test for the check_unknown parameter of _encode() with nan present + + with pytest.raises(ValueError, + match='y contains previously unseen label'): + _encode(values, uniques, encode=True, check_unknown=True, + allow_nan=True) + + # dont raise error if False + _encode(values, uniques, encode=True, check_unknown=False, allow_nan=True) + + +@pytest.mark.parametrize( + "uniques, values", + [(np.array(['a', 'b', 'c'], dtype=object), + np.array(['a', 'b', 'c', np.nan], dtype=object)), + (np.array([np.nan, 'b', 'c'], dtype=object), + np.array([np.nan, 'b', 'c', 'd'], dtype=object))]) +def test_encode_check_unknown_nan_object(uniques, values): + # test for the check_unknown parameter of _encode() with nan present + # parameter check_unknown is ignored for object dtype + with pytest.raises(ValueError, + match='y contains previously unseen label'): + _encode(values, uniques, encode=True, check_unknown=True, + allow_nan=True) From 2053fb2bb73c95525874d90e192bcb4cbfccb386 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 11:08:39 +0200 Subject: [PATCH 04/16] iter (clean _encoders.py) --- sklearn/preprocessing/_encoders.py | 534 +++---------------- sklearn/preprocessing/label.py | 4 +- sklearn/preprocessing/tests/test_encoders.py | 375 ++++--------- 3 files changed, 173 insertions(+), 740 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c1acfbe799485..ac03659d3ef23 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,20 +2,14 @@ # Joris Van den Bossche # License: BSD 3 clause -import numbers -import warnings - import numpy as np from scipy import sparse -from .. import get_config as _get_config from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils import deprecated -from ..utils.fixes import _argmax, _object_dtype_isnan +from ..utils.fixes import _argmax from ..utils.validation import check_is_fitted -from .base import _transform_selected from .label import _encode, _encode_check_unknown @@ -25,11 +19,10 @@ ] -class _BaseEncoder(BaseEstimator, TransformerMixin): +class _BaseEncoder(TransformerMixin, BaseEstimator): """ Base class for encoders that includes the code to categorize and transform the input features. - """ def _check_X(self, X): @@ -42,7 +35,6 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. 
- """ if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation @@ -79,8 +71,8 @@ def _get_feature(self, X, feature_idx): def _fit(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) - if self._categories != 'auto': - if len(self._categories) != n_features: + if self.categories != 'auto': + if len(self.categories) != n_features: raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") @@ -88,10 +80,10 @@ def _fit(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self._categories == 'auto': + if self.categories == 'auto': cats = _encode(Xi) else: - cats = np.array(self._categories[i], dtype=Xi.dtype) + cats = np.array(self.categories[i], dtype=Xi.dtype) if Xi.dtype != object: if not np.all(np.sort(cats) == cats): raise ValueError("Unsorted categories are not " @@ -110,6 +102,14 @@ def _transform(self, X, handle_unknown='error'): X_int = np.zeros((n_samples, n_features), dtype=np.int) X_mask = np.ones((n_samples, n_features), dtype=np.bool) + if n_features != len(self.categories_): + raise ValueError( + "The number of features in X is different to the number of " + "features of the fitted data. The fitted data had {} features " + "and the X has {} features." + .format(len(self.categories_,), n_features) + ) + for i in range(n_features): Xi = X_list[i] diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], @@ -134,66 +134,58 @@ def _transform(self, X, handle_unknown='error'): Xi = Xi.copy() Xi[~valid_mask] = self.categories_[i][0] - _, encoded = _encode(Xi, self.categories_[i], encode=True) + # We use check_unknown=False, since _encode_check_unknown was + # already called above. + _, encoded = _encode(Xi, self.categories_[i], encode=True, + check_unknown=False) X_int[:, i] = encoded return X_int, X_mask + def _more_tags(self): + return {'X_types': ['categorical']} -class OneHotEncoder(_BaseEncoder): - """Encode categorical integer features as a one-hot numeric array. +class OneHotEncoder(_BaseEncoder): + """Encode categorical features as a one-hot numeric array. The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array. - + returns a sparse matrix or dense array (depending on the ``sparse`` + parameter) By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. - The OneHotEncoder previously assumed that the input features take on - values in the range [0, max(values)). This behaviour is deprecated. - This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. - Note: a one-hot encoding of y labels should use a LabelBinarizer instead. - Read more in the :ref:`User Guide `. - Parameters ---------- categories : 'auto' or a list of lists/arrays of values, default='auto'. Categories (unique values) per feature: - - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. 
The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of numeric values. - The used categories can be found in the ``categories_`` attribute. - drop : 'first' or a list/array of shape (n_features,), default=None. Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression. - - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. - sparse : boolean, default=True Will return sparse matrix if set True else will return an array. - dtype : number type, default=np.float Desired dtype of output. - handle_unknown : 'error' or 'ignore', default='error'. Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter @@ -201,35 +193,6 @@ class OneHotEncoder(_BaseEncoder): transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be denoted as None. - - n_values : 'auto', int or array of ints, default='auto' - Number of values per feature. - - - 'auto' : determine value range from training data. - - int : number of categorical values per feature. - Each feature value should be in ``range(n_values)`` - - array : ``n_values[i]`` is the number of categorical values in - ``X[:, i]``. Each feature value should be - in ``range(n_values[i])`` - - .. deprecated:: 0.20 - The `n_values` keyword was deprecated in version 0.20 and will - be removed in 0.22. Use `categories` instead. - - categorical_features : 'all' or array of indices or mask, default='all' - Specify what features are treated as categorical. - - - 'all': All features are treated as categorical. - - array of indices: Array of categorical feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-categorical features are always stacked to the right of the matrix. - - .. deprecated:: 0.20 - The `categorical_features` keyword was deprecated in version - 0.20 and will be removed in 0.22. - You can use the ``ColumnTransformer`` instead. - Attributes ---------- categories_ : list of arrays @@ -237,52 +200,19 @@ class OneHotEncoder(_BaseEncoder): (in order of the features in X and corresponding with the output of ``transform``). This includes the category specified in ``drop`` (if any). - drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to + ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. - - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - .. deprecated:: 0.20 - The ``active_features_`` attribute was deprecated in version - 0.20 and will be removed in 0.22. - - feature_indices_ : array of shape (n_features,) - Indices to feature ranges. - Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by ``active_features_`` afterwards) - - .. 
deprecated:: 0.20 - The ``feature_indices_`` attribute was deprecated in version - 0.20 and will be removed in 0.22. - - n_values_ : array of shape (n_features,) - Maximum number of values per feature. - - .. deprecated:: 0.20 - The ``n_values_`` attribute was deprecated in version - 0.20 and will be removed in 0.22. - Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. - >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - ... # doctest: +ELLIPSIS - ... # doctest: +NORMALIZE_WHITESPACE - OneHotEncoder(categorical_features=None, categories=None, drop=None, - dtype=<... 'numpy.float64'>, handle_unknown='ignore', - n_values=None, sparse=True) - + OneHotEncoder(handle_unknown='ignore') >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() @@ -299,7 +229,6 @@ class OneHotEncoder(_BaseEncoder): >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() array([[0., 0., 0.], [1., 1., 0.]]) - See also -------- sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) @@ -315,184 +244,27 @@ class OneHotEncoder(_BaseEncoder): matrix indicating the presence of a class label. """ - def __init__(self, n_values=None, categorical_features=None, - categories=None, drop=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + def __init__(self, categories='auto', drop=None, sparse=True, + dtype=np.float64, handle_unknown='error'): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown - self.n_values = n_values - self.categorical_features = categorical_features self.drop = drop - # Deprecated attributes - - @deprecated("The ``active_features_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - @property - def active_features_(self): - check_is_fitted(self, 'categories_') - return self._active_features_ - - @deprecated("The ``feature_indices_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - @property - def feature_indices_(self): - check_is_fitted(self, 'categories_') - return self._feature_indices_ - - @deprecated("The ``n_values_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - @property - def n_values_(self): - check_is_fitted(self, 'categories_') - return self._n_values_ - - def _handle_deprecations(self, X): - # internal version of the attributes to handle deprecations - self._n_values = self.n_values - self._categories = getattr(self, '_categories', None) - self._categorical_features = getattr(self, '_categorical_features', - None) - - # user manually set the categories or second fit -> never legacy mode - if self.categories is not None or self._categories is not None: - self._legacy_mode = False - if self.categories is not None: - self._categories = self.categories - - # categories not set -> infer if we need legacy mode or not - elif self.n_values is not None and self.n_values != 'auto': - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and will be " - "removed in 0.22. You can use the 'categories' keyword " - "instead. 'n_values=n' corresponds to " - "'categories=[range(n)] * n_features'." 
- ) - warnings.warn(msg, DeprecationWarning) - self._legacy_mode = True - - else: # n_values = 'auto' - # n_values can also be None (default to catch usage), so set - # _n_values to 'auto' explicitly - self._n_values = 'auto' - if self.handle_unknown == 'ignore': - # no change in behaviour, no need to raise deprecation warning - self._legacy_mode = False - self._categories = 'auto' - if self.n_values == 'auto': - # user manually specified this - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and " - "will be removed in 0.22. n_values='auto' can be " - "replaced with categories='auto'." - ) - warnings.warn(msg, DeprecationWarning) - else: - # check if we have integer or categorical input - try: - check_array(X, dtype=np.int) - except ValueError: - self._legacy_mode = False - self._categories = 'auto' - else: - if self.drop is None: - msg = ( - "The handling of integer data will change in " - "version 0.22. Currently, the categories are " - "determined based on the range " - "[0, max(values)], while in the future they " - "will be determined based on the unique " - "values.\nIf you want the future behaviour " - "and silence this warning, you can specify " - "\"categories='auto'\".\n" - "In case you used a LabelEncoder before this " - "OneHotEncoder to convert the categories to " - "integers, then you can now use the " - "OneHotEncoder directly." - ) - warnings.warn(msg, FutureWarning) - self._legacy_mode = True - else: - msg = ( - "The handling of integer data will change in " - "version 0.22. Currently, the categories are " - "determined based on the range " - "[0, max(values)], while in the future they " - "will be determined based on the unique " - "values.\n The old behavior is not compatible " - "with the `drop` parameter. Instead, you " - "must manually specify \"categories='auto'\" " - "if you wish to use the `drop` parameter on " - "an array of entirely integer data. This will " - "enable the future behavior." - ) - raise ValueError(msg) - - # if user specified categorical_features -> always use legacy mode - if self.categorical_features is not None: - if (isinstance(self.categorical_features, str) - and self.categorical_features == 'all'): - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. The passed " - "value of 'all' is the default and can simply be removed.", - DeprecationWarning) - else: - if self.categories is not None: - raise ValueError( - "The 'categorical_features' keyword is deprecated, " - "and cannot be used together with specifying " - "'categories'.") - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. 
You can " - "use the ColumnTransformer instead.", DeprecationWarning) - # Set categories_ to empty list if no categorical columns exist - n_features = X.shape[1] - sel = np.zeros(n_features, dtype=bool) - sel[np.asarray(self.categorical_features)] = True - if sum(sel) == 0: - self.categories_ = [] - self._legacy_mode = True - self._categorical_features = self.categorical_features - else: - self._categorical_features = 'all' - - # Prevents new drop functionality from being used in legacy mode - if self._legacy_mode and self.drop is not None: + def _validate_keywords(self): + if self.handle_unknown not in ('error', 'ignore'): + msg = ("handle_unknown should be either 'error' or 'ignore', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + # If we have both dropped columns and ignored unknown + # values, there will be ambiguous cells. This creates difficulties + # in interpreting the model. + if self.drop is not None and self.handle_unknown != 'error': raise ValueError( - "The `categorical_features` and `n_values` keywords " - "are deprecated, and cannot be used together " - "with 'drop'.") - - def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to determine the categories of each feature. - - Returns - ------- - self - """ - - self._validate_keywords() - - self._handle_deprecations(X) - - if self._legacy_mode: - _transform_selected(X, self._legacy_fit_transform, self.dtype, - self._categorical_features, - copy=True) - return self - else: - self._fit(X, handle_unknown=self.handle_unknown) - self.drop_idx_ = self._compute_drop_idx() - return self + "`handle_unknown` must be 'error' when the drop parameter is " + "specified, as both would create categories that are all " + "zero.") def _compute_drop_idx(self): if self.drop is None: @@ -530,152 +302,48 @@ def _compute_drop_idx(self): "'first', None or array of objects, got {}") raise ValueError(msg.format(type(self.drop))) - def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - # If we have both dropped columns and ignored unknown - # values, there will be ambiguous cells. This creates difficulties - # in interpreting the model. - if self.drop is not None and self.handle_unknown != 'error': - raise ValueError( - "`handle_unknown` must be 'error' when the drop parameter is " - "specified, as both would create categories that are all " - "zero.") - - def _legacy_fit_transform(self, X): - """Assumes X contains only categorical features.""" - dtype = getattr(X, 'dtype', None) - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("OneHotEncoder in legacy mode cannot handle " - "categories encoded as negative integers. 
" - "Please set categories='auto' explicitly to " - "be able to use arbitrary integer values as " - "category identifiers.") - n_samples, n_features = X.shape - if (isinstance(self._n_values, str) and - self._n_values == 'auto'): - n_values = np.max(X, axis=0) + 1 - elif isinstance(self._n_values, numbers.Integral): - if (np.max(X, axis=0) >= self._n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self._n_values) - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self._n_values) - else: - try: - n_values = np.asarray(self._n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % type(self._n_values)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - - self._n_values_ = n_values - self.categories_ = [np.arange(n_val - 1, dtype=dtype) - for n_val in n_values] - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self._feature_indices_ = indices - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - - if (isinstance(self._n_values, str) and - self._n_values == 'auto'): - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self._active_features_ = active_features - - self.categories_ = [ - np.unique(X[:, i]).astype(dtype) if dtype - else np.unique(X[:, i]) for i in range(n_features)] - - return out if self.sparse else out.toarray() + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to determine the categories of each feature. + Returns + ------- + self + """ + self._validate_keywords() + self._fit(X, handle_unknown=self.handle_unknown) + self.drop_idx_ = self._compute_drop_idx() + return self def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X) but more convenient. - Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. - Returns ------- X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ - self._validate_keywords() + return super().fit_transform(X, y) - self._handle_deprecations(X) - - if self._legacy_mode: - return _transform_selected( - X, self._legacy_fit_transform, self.dtype, - self._categorical_features, copy=True) - else: - return self.fit(X).transform(X) - - def _legacy_transform(self, X): - """Assumes X contains only categorical features.""" - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("OneHotEncoder in legacy mode cannot handle " - "categories encoded as negative integers. " - "Please set categories='auto' explicitly to " - "be able to use arbitrary integer values as " - "category identifiers.") - n_samples, n_features = X.shape - - indices = self._feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) - - # We use only those categorical features of X that are known using fit. - # i.e lesser than n_values_ using mask. 
- # This means, if self.handle_unknown is "ignore", the row_indices and - # col_indices corresponding to the unknown categorical feature are - # ignored. - mask = (X < self._n_values_).ravel() - if np.any(~mask): - if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either error or " - "unknown got %s" % self.handle_unknown) - if self.handle_unknown == 'error': - raise ValueError("unknown categorical feature present %s " - "during transform." % X.ravel()[~mask]) - - column_indices = (X + indices[:-1]).ravel()[mask] - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features)[mask] - data = np.ones(np.sum(mask)) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - if (isinstance(self._n_values, str) and - self._n_values == 'auto'): - out = out[:, self._active_features_] - - return out if self.sparse else out.toarray() - - def _transform_new(self, X): - """New implementation assuming categorical input""" + def transform(self, X): + """Transform X using one-hot encoding. + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + The data to encode. + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array + Transformed input. + """ + check_is_fitted(self) # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) @@ -710,48 +378,20 @@ def _transform_new(self, X): else: return out - def transform(self, X): - """Transform X using one-hot encoding. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ - check_is_fitted(self, 'categories_') - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, - self._categorical_features, - copy=True) - else: - return self._transform_new(X) - def inverse_transform(self, X): """Convert the back data to the original representation. - In case unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. - Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. - Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. - """ - # if self._legacy_mode: - # raise ValueError("only supported for categorical features") - - check_is_fitted(self, 'categories_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -822,19 +462,16 @@ def inverse_transform(self, X): def get_feature_names(self, input_features=None): """Return feature names for output features. - Parameters ---------- input_features : list of string, length n_features, optional String names for input features if available. By default, "x0", "x1", ... "xn_features" is used. - Returns ------- output_feature_names : array of string, length n_output_features - """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] @@ -857,57 +494,45 @@ def get_feature_names(self, input_features=None): class OrdinalEncoder(_BaseEncoder): """Encode categorical features as an integer array. 
- The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers. This results in a single column of integers (0 to n_categories - 1) per feature. - Read more in the :ref:`User Guide `. - Parameters ---------- categories : 'auto' or a list of lists/arrays of values. Categories (unique values) per feature: - - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. - The used categories can be found in the ``categories_`` attribute. - dtype : number type, default np.float64 Desired dtype of output. - Attributes ---------- categories_ : list of arrays The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of ``transform``). - Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to an ordinal encoding. - >>> from sklearn.preprocessing import OrdinalEncoder >>> enc = OrdinalEncoder() >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - ... # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + OrdinalEncoder() >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) array([[0., 2.], [1., 0.]]) - >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) - See also -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of @@ -922,56 +547,44 @@ def __init__(self, categories='auto', dtype=np.float64): def fit(self, X, y=None): """Fit the OrdinalEncoder to X. - Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. - Returns ------- self - """ - # base classes uses _categories to deal with deprecations in - # OneHoteEncoder: can be removed once deprecations are removed - self._categories = self.categories self._fit(X) return self def transform(self, X): """Transform X to ordinal codes. - Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. - Returns ------- X_out : sparse matrix or a 2-d array Transformed input. - """ X_int, _ = self._transform(X) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): """Convert the data back to the original representation. - Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. - Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. 
- """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -991,7 +604,4 @@ def inverse_transform(self, X): labels = X[:, i].astype('int64', copy=False) X_tr[:, i] = self.categories_[i][labels] - return X_tr - - def _more_tags(self): - return {'X_types': ['categorical']} + return X_tr \ No newline at end of file diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 3adf40cc519b2..5e8d39b25cdb7 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -205,7 +205,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): diff = np.array(list(set(values) - uniques_set)) # set([np.nan]) - set([np.nan]) returns set() # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} - if diff and any(_get_mask(diff, np.nan)): + if len(diff) and any(_get_mask(diff, np.nan)): if not allow_nan: raise ValueError('Nan found during check_unknown') else: @@ -213,7 +213,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): any(_get_mask(set(values), np.nan)): diff = diff[~_get_mask(diff, np.nan)] if return_mask: - if diff: + if len(diff): valid_mask = np.array([val in uniques_set for val in values]) else: valid_mask = np.ones(len(values), dtype=bool) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 29cd6602e4f10..4804bd03ed6b8 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -8,14 +8,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import assert_no_warnings from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder @@ -27,228 +20,37 @@ def toarray(a): return a -def test_one_hot_encoder_sparse(): - # Test OneHotEncoder's fit and transform. 
- X = [[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder() - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - # discover max values automatically - X_trans = enc.fit_transform(X).toarray() - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) - - # check outcome - assert_array_equal(X_trans, - [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]]) - - # max value given as 3 - # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=4) - enc = OneHotEncoder(n_values=4) - with ignore_warnings(category=DeprecationWarning): - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 4 * 3)) - assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) - - # max value given per feature - # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=[3, 2, 2]) - enc = OneHotEncoder(n_values=[3, 2, 2]) - with ignore_warnings(category=DeprecationWarning): - X = [[1, 0, 1], [0, 1, 1]] - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 3 + 2 + 2)) - assert_array_equal(enc.n_values_, [3, 2, 2]) - # check that testing with larger feature works: - X = np.array([[2, 0, 1], [0, 1, 1]]) - enc.transform(X) - - # test that an error is raised when out of bounds: - X_too_large = [[0, 2, 1], [0, 1, 1]] - assert_raises(ValueError, enc.transform, X_too_large) - error_msg = r"unknown categorical feature present \[2\] during transform" - assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) - with ignore_warnings(category=DeprecationWarning): - assert_raises( - ValueError, - OneHotEncoder(n_values=2).fit_transform, X) - - # test that error is raised when wrong number of features - assert_raises(ValueError, enc.transform, X[:, :-1]) - - # test that error is raised when wrong number of features in fit - # with prespecified n_values - with ignore_warnings(category=DeprecationWarning): - assert_raises(ValueError, enc.fit, X[:, :-1]) - # test exception on wrong init param - with ignore_warnings(category=DeprecationWarning): - assert_raises( - TypeError, OneHotEncoder(n_values=np.int).fit, X) +def test_one_hot_encoder_sparse_dense(): + # check that sparse and dense will give the same results - enc = OneHotEncoder() - # test negative input to fit - with ignore_warnings(category=FutureWarning): - assert_raises(ValueError, enc.fit, [[0], [-1]]) - - # test negative input to transform - with ignore_warnings(category=FutureWarning): - enc.fit([[0], [1]]) - assert_raises(ValueError, enc.transform, [[0], [-1]]) - - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - enc = OneHotEncoder(drop='first', n_values=1) - for method in (enc.fit, enc.fit_transform): - assert_raises_regex( - ValueError, - 'The `categorical_features` and `n_values` keywords ', - method, [[0], [-1]]) - - enc = OneHotEncoder(drop='first', categorical_features='all') - assert_raises_regex( - ValueError, - 'The `categorical_features` and `n_values` keywords ', - method, [[0], [-1]]) - - -def test_one_hot_encoder_dense(): - # check for sparse=False - X = [[3, 2, 1], [0, 1, 1]] - enc = OneHotEncoder(sparse=False) - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - # discover max values automatically - X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (2, 5)) - assert_array_equal(enc.active_features_, - np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) - assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + X = np.array([[3, 2, 1], [0, 1, 
1]]) + enc_sparse = OneHotEncoder() + enc_dense = OneHotEncoder(sparse=False) - # check outcome - assert_array_equal(X_trans, - np.array([[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]])) - - -def test_one_hot_encoder_deprecationwarnings(): - for X in [[[3, 2, 1], [0, 1, 1]], - [[3., 2., 1.], [0., 1., 1.]]]: - enc = OneHotEncoder() - assert_warns_message(FutureWarning, "handling of integer", - enc.fit, X) - enc = OneHotEncoder() - assert_warns_message(FutureWarning, "handling of integer", - enc.fit_transform, X) - - # check it still works correctly as well - with ignore_warnings(category=FutureWarning): - X_trans = enc.fit_transform(X).toarray() - res = [[0., 1., 0., 1., 1.], - [1., 0., 1., 0., 1.]] - assert_array_equal(X_trans, res) - - # check deprecated attributes - assert_warns(DeprecationWarning, lambda: enc.active_features_) - assert_warns(DeprecationWarning, lambda: enc.feature_indices_) - assert_warns(DeprecationWarning, lambda: enc.n_values_) - - # check no warning is raised if keyword is specified - enc = OneHotEncoder(categories='auto') - assert_no_warnings(enc.fit, X) - enc = OneHotEncoder(categories='auto') - assert_no_warnings(enc.fit_transform, X) - X_trans = enc.fit_transform(X).toarray() - assert_array_equal(X_trans, res) + X_trans_sparse = enc_sparse.fit_transform(X) + X_trans_dense = enc_dense.fit_transform(X) - # check there is also a warning if the default is passed - enc = OneHotEncoder(n_values='auto', handle_unknown='ignore') - assert_warns(DeprecationWarning, enc.fit, X) + assert X_trans_sparse.shape == (2, 5) + assert X_trans_dense.shape == (2, 5) - X = np.array([['cat1', 'cat2']], dtype=object).T - enc = OneHotEncoder(categorical_features='all') - assert_warns(DeprecationWarning, enc.fit, X) + assert sparse.issparse(X_trans_sparse) + assert not sparse.issparse(X_trans_dense) + # check outcome + assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) -def test_one_hot_encoder_force_new_behaviour(): - # ambiguous integer case (non secutive range of categories) - X = np.array([[1, 2]]).T - X2 = np.array([[0, 1]]).T - # without argument -> by default using legacy behaviour with warnings +def test_one_hot_encoder_diff_n_features(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + X2 = np.array([[1, 0]]) enc = OneHotEncoder() - - with ignore_warnings(category=FutureWarning): - enc.fit(X) - - res = enc.transform(X2) - exp = np.array([[0, 0], [1, 0]]) - assert_array_equal(res.toarray(), exp) - - # with explicit auto argument -> don't use legacy behaviour - # (so will raise an error on unseen value within range) - enc = OneHotEncoder(categories='auto') enc.fit(X) - assert_raises(ValueError, enc.transform, X2) - - -def _run_one_hot(X, X2, cat): - # enc = assert_warns( - # DeprecationWarning, - # OneHotEncoder, categorical_features=cat) - enc = OneHotEncoder(categorical_features=cat) - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - Xtr = enc.fit_transform(X) - with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - X2tr = enc.fit(X).transform(X2) - return Xtr, X2tr - - -def _check_one_hot(X, X2, cat, n_features): - ind = np.where(cat)[0] - # With mask - A, B = _run_one_hot(X, X2, cat) - # With indices - C, D = _run_one_hot(X, X2, ind) - # Check shape - assert_equal(A.shape, (2, n_features)) - assert_equal(B.shape, (1, n_features)) - assert_equal(C.shape, (2, n_features)) - assert_equal(D.shape, (1, n_features)) - # Check that mask and 
indices give the same results
-    assert_array_equal(toarray(A), toarray(C))
-    assert_array_equal(toarray(B), toarray(D))
-
-
-def test_one_hot_encoder_categorical_features():
-    X = np.array([[3, 2, 1], [0, 1, 1]])
-    X2 = np.array([[1, 1, 1]])
-
-    cat = [True, False, False]
-    _check_one_hot(X, X2, cat, 4)
-
-    # Edge case: all non-categorical
-    cat = [False, False, False]
-    _check_one_hot(X, X2, cat, 3)
-
-    # Edge case: all categorical
-    cat = [True, True, True]
-    _check_one_hot(X, X2, cat, 5)
-
-    # check error raised if also specifying categories
-    oh = OneHotEncoder(categories=[range(3)],
-                       categorical_features=[True, False, False])
-    assert_raises(ValueError, oh.fit, X)
-
-
-def test_one_hot_encoder_categorical_features_ignore_unknown():
-    # GH12881 bug in combination of categorical_features with ignore
-    X = np.array([[1, 2, 3], [4, 5, 6], [2, 3, 2]]).T
-    oh = OneHotEncoder(categorical_features=[2], handle_unknown='ignore')
-
-    with ignore_warnings(category=DeprecationWarning):
-        res = oh.fit_transform(X)
-
-    expected = np.array([[1, 0, 1], [0, 1, 0], [1, 2, 3], [4, 5, 6]]).T
-    assert_array_equal(res.toarray(), expected)


 def test_one_hot_encoder_handle_unknown():
@@ -258,8 +60,9 @@ def test_one_hot_encoder_handle_unknown():
     # Test that one hot encoder raises error for unknown features
     # present during transform.
     oh = OneHotEncoder(handle_unknown='error')
-    assert_warns(FutureWarning, oh.fit, X)
-    assert_raises(ValueError, oh.transform, X2)
+    oh.fit(X)
+    with pytest.raises(ValueError, match='Found unknown categories'):
+        oh.transform(X2)

     # Test the ignore option, ignores unknown features (giving all 0's)
     oh = OneHotEncoder(handle_unknown='ignore')
@@ -273,7 +76,8 @@ def test_one_hot_encoder_handle_unknown():
     # Raise error if handle_unknown is neither ignore or error.
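     # A minimal sketch of the error path restored above (a hypothetical
     # example assuming the API used in this file, not code from the
     # patch): the default handle_unknown='error' rejects unseen
     # categories at transform time.
     #
     #     enc = OneHotEncoder(categories='auto').fit([['a'], ['b']])
     #     enc.transform([['c']])   # ValueError: Found unknown categories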
     oh = OneHotEncoder(handle_unknown='42')
-    assert_raises(ValueError, oh.fit, X)
+    with pytest.raises(ValueError, match='handle_unknown should be either'):
+        oh.fit(X)


 def test_one_hot_encoder_not_fitted():
@@ -285,19 +89,6 @@ def test_one_hot_encoder_not_fitted():
         enc.transform(X)


-def test_one_hot_encoder_no_categorical_features():
-    X = np.array([[3, 2, 1], [0, 1, 1]], dtype='float64')
-
-    cat = [False, False, False]
-    enc = OneHotEncoder(categorical_features=cat)
-    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
-        X_tr = enc.fit_transform(X)
-    expected_features = np.array([], dtype='object')
-    assert_array_equal(X, X_tr)
-    assert_array_equal(enc.get_feature_names(), expected_features)
-    assert enc.categories_ == []
-
-
 def test_one_hot_encoder_handle_unknown_strings():
     X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
     X2 = np.array(['55555', '22']).reshape((-1, 1))
@@ -346,6 +137,47 @@ def test_one_hot_encoder_dtype_pandas(output_dtype):
     assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)


+def test_one_hot_encoder_feature_names():
+    enc = OneHotEncoder()
+    X = [['Male', 1, 'girl', 2, 3],
+         ['Female', 41, 'girl', 1, 10],
+         ['Male', 51, 'boy', 12, 3],
+         ['Male', 91, 'girl', 21, 30]]
+
+    enc.fit(X)
+    feature_names = enc.get_feature_names()
+    assert isinstance(feature_names, np.ndarray)
+
+    assert_array_equal(['x0_Female', 'x0_Male',
+                        'x1_1', 'x1_41', 'x1_51', 'x1_91',
+                        'x2_boy', 'x2_girl',
+                        'x3_1', 'x3_2', 'x3_12', 'x3_21',
+                        'x4_3',
+                        'x4_10', 'x4_30'], feature_names)
+
+    feature_names2 = enc.get_feature_names(['one', 'two',
+                                            'three', 'four', 'five'])
+
+    assert_array_equal(['one_Female', 'one_Male',
+                        'two_1', 'two_41', 'two_51', 'two_91',
+                        'three_boy', 'three_girl',
+                        'four_1', 'four_2', 'four_12', 'four_21',
+                        'five_3', 'five_10', 'five_30'], feature_names2)
+
+    with pytest.raises(ValueError, match="input_features should have length"):
+        enc.get_feature_names(['one', 'two'])
+
+
+def test_one_hot_encoder_feature_names_unicode():
+    enc = OneHotEncoder()
+    X = np.array([['c❤t1', 'dat2']], dtype=object).T
+    enc.fit(X)
+    feature_names = enc.get_feature_names()
+    assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names)
+    feature_names = enc.get_feature_names(input_features=['n👍me'])
+    assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)
+
+
 def test_one_hot_encoder_set_params():
     X = np.array([[1, 2]]).T
     oh = OneHotEncoder()
@@ -428,7 +260,8 @@ def test_one_hot_encoder_inverse(sparse_, drop):
     # incorrect shape raises
     X_tr = np.array([[0, 1, 1], [1, 0, 1]])
     msg = re.escape('Shape of the passed X data is not correct')
-    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_tr)


 @pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@@ -687,7 +520,8 @@ def test_ordinal_encoder_inverse():
     # incorrect shape raises
     X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
     msg = re.escape('Shape of the passed X data is not correct')
-    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_tr)


 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
@@ -718,6 +552,7 @@ def test_ordinal_encoder_raise_categories_shape():
     with pytest.raises(ValueError, match=msg):
         enc.fit(X)

+
 def test_encoder_dtypes():
     # check that dtypes are preserved when determining categories
     enc = OneHotEncoder(categories='auto')
@@ -757,7 +592,7 @@ def test_encoder_dtypes_pandas():
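     # A small sketch of the dtype bookkeeping this hunk restores (an
     # editorial example assuming pandas is available, not code from the
     # patch): each fitted category array keeps its DataFrame column's
     # own dtype rather than the builtin Python types.
     #
     #     import pandas as pd
     #     X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
     #     [X[c].dtype for c in X]
     #     # -> [dtype('int64'), dtype('O'), dtype('float64')]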
     assert_array_equal(enc.transform(X).toarray(), exp)

     X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
-    X_type = [int, object, float]
+    X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
     enc.fit(X)
     assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
     assert_array_equal(enc.transform(X).toarray(), exp)
@@ -788,48 +623,31 @@ def test_one_hot_encoder_drop_manual():
         enc.inverse_transform(trans))


-def test_one_hot_encoder_invalid_params():
-    enc = OneHotEncoder(drop='second')
-    assert_raises_regex(
-        ValueError,
-        "Wrong input for parameter `drop`.",
-        enc.fit, [["Male"], ["Female"]])
-
-    enc = OneHotEncoder(handle_unknown='ignore', drop='first')
-    assert_raises_regex(
-        ValueError,
-        "`handle_unknown` must be 'error'",
-        enc.fit, [["Male"], ["Female"]])
-
-    enc = OneHotEncoder(drop='first')
-    assert_raises_regex(
-        ValueError,
-        "The handling of integer data will change in version",
-        enc.fit, [[1], [2]])
-
-    enc = OneHotEncoder(drop='first', categories='auto')
-    assert_no_warnings(enc.fit_transform, [[1], [2]])
-
-    enc = OneHotEncoder(drop=np.asarray('b', dtype=object))
-    assert_raises_regex(
-        ValueError,
-        "Wrong input for parameter `drop`.",
-        enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
-
-    enc = OneHotEncoder(drop=['ghi', 3, 59])
-    assert_raises_regex(
-        ValueError,
-        "The following categories were supposed",
-        enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
+@pytest.mark.parametrize(
+    "X_fit, params, err_msg",
+    [([["Male"], ["Female"]], {'drop': 'second'},
+      "Wrong input for parameter `drop`"),
+     ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
+      "`handle_unknown` must be 'error'"),
+     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
+      {'drop': np.asarray('b', dtype=object)},
+      "Wrong input for parameter `drop`"),
+     ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
+      {'drop': ['ghi', 3, 59]},
+      "The following categories were supposed")]
+)
+def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
+    enc = OneHotEncoder(**params)
+    with pytest.raises(ValueError, match=err_msg):
+        enc.fit(X_fit)


 @pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
 def test_invalid_drop_length(drop):
     enc = OneHotEncoder(drop=drop)
-    assert_raises_regex(
-        ValueError,
-        "`drop` should have length equal to the number",
-        enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
+    err_msg = "`drop` should have length equal to the number"
+    with pytest.raises(ValueError, match=err_msg):
+        enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])


 @pytest.mark.parametrize("density", [True, False],
                          ids=['sparse', 'dense'])
@@ -854,3 +672,8 @@ def test_categories(density, drop):
     assert cat_list[drop_idx] == drop_cat
     assert isinstance(ohe_test.drop_idx_, np.ndarray)
     assert ohe_test.drop_idx_.dtype == np.int_
+
+
+@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
+def test_encoders_has_categorical_tags(Encoder):
+    assert 'categorical' in Encoder()._get_tags()['X_types']
\ No newline at end of file

From 2d71efa02606725aeb955799a1a7469df3e1a587 Mon Sep 17 00:00:00 2001
From: twsthomas
Date: Wed, 18 Sep 2019 11:17:17 +0200
Subject: [PATCH 05/16] clean _encoders and test_encoders

---
 sklearn/preprocessing/_encoders.py           | 61 +++++++++++++++++++-
 sklearn/preprocessing/tests/test_encoders.py | 17 +++---
 2 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index ac03659d3ef23..c33744204fc36 100644
--- a/sklearn/preprocessing/_encoders.py
+++
b/sklearn/preprocessing/_encoders.py @@ -19,10 +19,11 @@ ] -class _BaseEncoder(TransformerMixin, BaseEstimator): +class _BaseEncoder(BaseEstimator, TransformerMixin): """ Base class for encoders that includes the code to categorize and transform the input features. + """ def _check_X(self, X): @@ -35,6 +36,7 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. + """ if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation @@ -148,44 +150,57 @@ def _more_tags(self): class OneHotEncoder(_BaseEncoder): """Encode categorical features as a one-hot numeric array. + The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the ``sparse`` parameter) + By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. + This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. + Note: a one-hot encoding of y labels should use a LabelBinarizer instead. + Read more in the :ref:`User Guide `. + Parameters ---------- categories : 'auto' or a list of lists/arrays of values, default='auto'. Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of numeric values. + The used categories can be found in the ``categories_`` attribute. + drop : 'first' or a list/array of shape (n_features,), default=None. Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression. + - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. + sparse : boolean, default=True Will return sparse matrix if set True else will return an array. + dtype : number type, default=np.float Desired dtype of output. + handle_unknown : 'error' or 'ignore', default='error'. Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter @@ -193,6 +208,7 @@ class OneHotEncoder(_BaseEncoder): transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be denoted as None. + Attributes ---------- categories_ : list of arrays @@ -200,19 +216,23 @@ class OneHotEncoder(_BaseEncoder): (in order of the features in X and corresponding with the output of ``transform``). This includes the category specified in ``drop`` (if any). 
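+
+        For instance (an illustrative sketch, not taken from the test
+        suite): after ``fit([['a'], ['b']])``, ``categories_`` is
+        ``[array(['a', 'b'], dtype=object)]``.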
+ drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to + ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. + Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. + >>> from sklearn.preprocessing import OneHotEncoder >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() @@ -229,6 +249,7 @@ class OneHotEncoder(_BaseEncoder): >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() array([[0., 0., 0.], [1., 1., 0.]]) + See also -------- sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) @@ -304,10 +325,12 @@ def _compute_drop_idx(self): def fit(self, X, y=None): """Fit OneHotEncoder to X. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. + Returns ------- self @@ -319,11 +342,14 @@ def fit(self, X, y=None): def fit_transform(self, X, y=None): """Fit OneHotEncoder to X, then transform X. + Equivalent to fit(X).transform(X) but more convenient. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. + Returns ------- X_out : sparse matrix if sparse=True else a 2-d array @@ -334,10 +360,12 @@ def fit_transform(self, X, y=None): def transform(self, X): """Transform X using one-hot encoding. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. + Returns ------- X_out : sparse matrix if sparse=True else a 2-d array @@ -380,16 +408,20 @@ def transform(self, X): def inverse_transform(self, X): """Convert the back data to the original representation. + In case unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. + Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. + Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. + """ check_is_fitted(self) X = check_array(X, accept_sparse='csr') @@ -462,14 +494,17 @@ def inverse_transform(self, X): def get_feature_names(self, input_features=None): """Return feature names for output features. + Parameters ---------- input_features : list of string, length n_features, optional String names for input features if available. By default, "x0", "x1", ... "xn_features" is used. + Returns ------- output_feature_names : array of string, length n_output_features + """ check_is_fitted(self) cats = self.categories_ @@ -494,32 +529,41 @@ def get_feature_names(self, input_features=None): class OrdinalEncoder(_BaseEncoder): """Encode categorical features as an integer array. + The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers. This results in a single column of integers (0 to n_categories - 1) per feature. + Read more in the :ref:`User Guide `. + Parameters ---------- categories : 'auto' or a list of lists/arrays of values. 
Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. + The used categories can be found in the ``categories_`` attribute. + dtype : number type, default np.float64 Desired dtype of output. + Attributes ---------- categories_ : list of arrays The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of ``transform``). + Examples -------- Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to an ordinal encoding. + >>> from sklearn.preprocessing import OrdinalEncoder >>> enc = OrdinalEncoder() >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] @@ -530,9 +574,11 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.transform([['Female', 3], ['Male', 1]]) array([[0., 2.], [1., 0.]]) + >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) + See also -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of @@ -547,13 +593,16 @@ def __init__(self, categories='auto', dtype=np.float64): def fit(self, X, y=None): """Fit the OrdinalEncoder to X. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. + Returns ------- self + """ self._fit(X) @@ -561,28 +610,34 @@ def fit(self, X, y=None): def transform(self, X): """Transform X to ordinal codes. + Parameters ---------- X : array-like, shape [n_samples, n_features] The data to encode. + Returns ------- X_out : sparse matrix or a 2-d array Transformed input. + """ X_int, _ = self._transform(X) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): """Convert the data back to the original representation. + Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] The transformed data. + Returns ------- X_tr : array-like, shape [n_samples, n_features] Inverse transformed array. 
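+
+        For instance (an illustrative sketch): with ``categories_`` equal
+        to ``[array(['a', 'b'], dtype=object)]``, ``inverse_transform([[1.]])``
+        gives ``array([['b']], dtype=object)``.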
+ """ check_is_fitted(self) X = check_array(X, accept_sparse='csr') @@ -604,4 +659,4 @@ def inverse_transform(self, X): labels = X[:, i].astype('int64', copy=False) X_tr[:, i] = self.categories_[i][labels] - return X_tr \ No newline at end of file + return X_tr diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 4804bd03ed6b8..8e1a61781544a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -8,6 +8,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_allclose from sklearn.preprocessing import OneHotEncoder @@ -260,8 +261,7 @@ def test_one_hot_encoder_inverse(sparse_, drop): # incorrect shape raises X_tr = np.array([[0, 1, 1], [1, 0, 1]]) msg = re.escape('Shape of the passed X data is not correct') - with pytest.raises(ValueError, match=msg): - enc.inverse_transform(X_tr) + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @@ -520,8 +520,7 @@ def test_ordinal_encoder_inverse(): # incorrect shape raises X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) msg = re.escape('Shape of the passed X data is not correct') - with pytest.raises(ValueError, match=msg): - enc.inverse_transform(X_tr) + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, @@ -552,7 +551,6 @@ def test_ordinal_encoder_raise_categories_shape(): with pytest.raises(ValueError, match=msg): enc.fit(X) - def test_encoder_dtypes(): # check that dtypes are preserved when determining categories enc = OneHotEncoder(categories='auto') @@ -645,9 +643,10 @@ def test_one_hot_encoder_invalid_params(X_fit, params, err_msg): @pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']]) def test_invalid_drop_length(drop): enc = OneHotEncoder(drop=drop) - err_msg = "`drop` should have length equal to the number" - with pytest.raises(ValueError, match=err_msg): - enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) + assert_raises_regex( + ValueError, + "`drop` should have length equal to the number", + enc.fit, [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]]) @pytest.mark.parametrize("density", [True, False], @@ -676,4 +675,4 @@ def test_categories(density, drop): @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): - assert 'categorical' in Encoder()._get_tags()['X_types'] \ No newline at end of file + assert 'categorical' in Encoder()._get_tags()['X_types'] From 8a66e43ed39a910b118d71ad47606bbd9c7f22ac Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 11:27:48 +0200 Subject: [PATCH 06/16] clean --- .gitignore | 5 +++++ sklearn/preprocessing/label.py | 8 -------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 73feb51e76e2f..52163a5877104 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +*.ipynb +*(copy)* +*.code-workspace +*thomas* + *.pyc *.so *.pyd diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 5e8d39b25cdb7..52e7b5df4b122 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -35,14 +35,6 @@ ] -def get_encoding(uniques, values): - if np.diff(uniques) > 0: - return np.searchsorted(uniques, values) - else: - table = {val: i for i, val 
in enumerate(uniques)} - return np.array([table[v] for v in values]) - - def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): # only used in _encode below, see docstring there for details From c613940f8b86de64268a7922c8145d4725fe7222 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 11:29:33 +0200 Subject: [PATCH 07/16] clean --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 52163a5877104..73feb51e76e2f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,3 @@ -*.ipynb -*(copy)* -*.code-workspace -*thomas* - *.pyc *.so *.pyd From 894c0e5dc68f35aca162fc59e37b1a9b978788d3 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 15:04:07 +0200 Subject: [PATCH 08/16] iter --- sklearn/preprocessing/label.py | 50 ++++++++++++++--------- sklearn/preprocessing/tests/test_label.py | 4 +- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index fb3798ee44402..15df02a40986f 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -25,7 +25,7 @@ from ..utils.validation import _num_samples from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target -from ..impute._base import _get_mask +from ..utils.mask import _get_mask __all__ = [ 'label_binarize', @@ -45,7 +45,7 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, # np.nan is always sorted last if len(uniques) and is_scalar_nan(uniques[-1]): if not allow_nan: - raise ValueError('nan found in values and allow_nan=False') + raise ValueError('Values contains NaN and allow_nan=False') nan_idx = np.searchsorted(uniques, np.nan) uniques = uniques[:nan_idx+1] if encode: @@ -57,7 +57,7 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, # np.nan is always sorted last if len(uniques) and is_scalar_nan(uniques[-1]): if not allow_nan: - raise ValueError('nan found in values and allow_nan=False') + raise ValueError('Values contains NaN and allow_nan=False') nan_idx = np.searchsorted(uniques, np.nan) uniques = uniques[:nan_idx+1] return uniques @@ -79,9 +79,9 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): missing_mask = _get_mask(values, np.nan) if np.any(missing_mask): if not allow_nan: - raise ValueError('nan found in values and allow_nan=False') + raise ValueError('Values contains NaN and allow_nan=False') else: - # sorted([4, np.nan]) != np.sort([4, np.nan]) + # need np.sort to ensure nan is sorted last uniques = np.sort(list(set(values[~missing_mask]) | {np.nan})) else: uniques = sorted(set(values)) @@ -194,16 +194,20 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): """ if values.dtype == object: uniques_set = set(uniques) - diff = np.array(list(set(values) - uniques_set)) - # set([np.nan]) - set([np.nan]) returns set() - # but set(np.array([np.nan])) - set(np.array([np.nan])) return {nan} - if len(diff) and any(_get_mask(diff, np.nan)): + values_set = set(values) + array_values_set = np.array(values_set) + is_nan_in_value = np.any(_object_dtype_isnan(array_values_set)) + if is_nan_in_value: if not allow_nan: - raise ValueError('Nan found during check_unknown') + raise ValueError('Values contains NaN') + elif any(_get_mask(uniques, np.nan)): + diff = np.array(array_values_set - uniques_set) + diff = diff[~_get_mask(diff, np.nan)] else: - if any(_get_mask(uniques_set, np.nan)) and\ - any(_get_mask(set(values), 
np.nan)): - diff = diff[~_get_mask(diff, np.nan)] + diff = list(values_set - uniques_set) + else: + diff = list(values_set - uniques_set) + if return_mask: if len(diff): valid_mask = np.array([val in uniques_set for val in values]) @@ -214,15 +218,21 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): return diff else: unique_values = np.unique(values) - diff = np.setdiff1d(unique_values, uniques, assume_unique=True) - # np.setdiff1d([np.nan],[np.nan]) returns [np.nan] - if any(_get_mask(diff, np.nan)): + mask_nan_in_values = _get_mask(unique_values, np.nan) + if np.any(mask_nan_in_values): if not allow_nan: - raise ValueError('Nan found during check_unknown') + raise ValueError('Values conatins NaN') else: - if any(_get_mask(unique_values, np.nan)) and\ - any(_get_mask(uniques, np.nan)): - diff = [x for x in diff if not is_scalar_nan(x)] + mask_nan_in_uniques = _get_mask(uniques, np.nan) + if np.any(mask_nan_in_uniques): + diff = np.setdiff1d(unique_values[~mask_nan_in_values], + uniques[~mask_nan_in_uniques], + assume_unique=True) + else: + diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + else: + diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + if return_mask: if diff: valid_mask = np.in1d(values, uniques) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 52fe83e0a83c8..43169776a701d 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -242,7 +242,9 @@ def test_label_encode_with_nan(): [np.asarray([np.nan, np.nan], dtype=float), np.asarray([np.nan, np.nan], dtype=object)]) def test_label_encode_raise_nan(values): - assert_raises(ValueError, _encode, values, allow_nan=False) + msg = 'Values contains NaN' + with pytest.raises(ValueError, match=msg): + _encode(values, allow_nan=False) @pytest.mark.parametrize("dtype", ['str', 'object']) From 1a266b0f8cc0859e2dde5f8f693bf8dd8c197128 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 15:12:24 +0200 Subject: [PATCH 09/16] typo --- sklearn/preprocessing/label.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 15df02a40986f..bb41359713ac0 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -229,7 +229,8 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): uniques[~mask_nan_in_uniques], assume_unique=True) else: - diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + diff = np.setdiff1d(unique_values, uniques, + assume_unique=True) else: diff = np.setdiff1d(unique_values, uniques, assume_unique=True) From afad176b6a1b709df5bf73315122d99efed1ca29 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 17:04:09 +0200 Subject: [PATCH 10/16] add functions --- sklearn/preprocessing/label.py | 100 +++++++++++++--------- sklearn/preprocessing/tests/test_label.py | 31 +++++++ 2 files changed, 90 insertions(+), 41 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index bb41359713ac0..2294b6f894035 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -35,31 +35,43 @@ ] +def _nan_unique(ar, return_inverse=False, allow_nan=False): + # mimic np.unique with allow_nan option + + if return_inverse: + uniques, inverse = np.unique(ar, return_inverse=True) + else: + uniques = np.unique(ar) + + nan_idx = None + # np.nan is always sorted last + 
if len(uniques) and is_scalar_nan(uniques[-1]): + if not allow_nan: + raise ValueError('Values contains NaN and allow_nan=False') + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + + if return_inverse and nan_idx is not None: + inverse[inverse > nan_idx] = nan_idx + + if return_inverse: + return uniques, inverse + else: + return uniques + + def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: if encode: - uniques, encoded = np.unique(values, return_inverse=True) - # np.nan is always sorted last - if len(uniques) and is_scalar_nan(uniques[-1]): - if not allow_nan: - raise ValueError('Values contains NaN and allow_nan=False') - nan_idx = np.searchsorted(uniques, np.nan) - uniques = uniques[:nan_idx+1] - if encode: - encoded[encoded > nan_idx] = nan_idx + uniques, encoded = _nan_unique(values, return_inverse=True, + allow_nan=allow_nan) return uniques, encoded else: # unique sorts - uniques = np.unique(values) - # np.nan is always sorted last - if len(uniques) and is_scalar_nan(uniques[-1]): - if not allow_nan: - raise ValueError('Values contains NaN and allow_nan=False') - nan_idx = np.searchsorted(uniques, np.nan) - uniques = uniques[:nan_idx+1] + uniques = _nan_unique(values, allow_nan=allow_nan) return uniques if encode: if check_unknown: @@ -73,6 +85,25 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, return uniques +class TableWithNan(object): + #  hash table which allows nan as a key + + def __init__(self): + self.dict = dict() + self.nan_value = None + + def get(self, key): + if is_scalar_nan(key) and self.nan_value is not None: + return self.nan_value + return self.dict[key] + + def set(self, key, value): + if is_scalar_nan(key): + self.nan_value = value + else: + self.dict[key] = value + + def _encode_python(values, uniques=None, encode=False, allow_nan=False): # only used in _encode below, see docstring there for details if uniques is None: @@ -87,29 +118,15 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) if encode: - table = dict() + # hash is not enough to identify nan + table = TableWithNan() for i, val in enumerate(uniques): - if is_scalar_nan(val): - # table[nan] always raise KeyError - nan_index = i - else: - table[val] = i + table.set(val, i) try: - encoded = [] - for val in values: - if is_scalar_nan(val): - encoded.append(nan_index) - else: - encoded.append(table[val]) - encoded = np.array(encoded) + encoded = np.array([table.get(val) for val in values]) except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) - except UnboundLocalError as e: - # 'nan_index' referenced before assignment - raise ValueError("y contains previously unseen label nan, " - "consider using allow_nan=True. %s" - % str(e)) return uniques, encoded else: return uniques @@ -143,7 +160,7 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, _BaseEncoder._transform() to avoid calling _encode_check_unknown() twice. allow_nan : bool, default False - if True, encode np.nan as another category. Otherwise raise an error + if True, encode `np.nan` as another category. 
Otherwise raise an error if nan are present Returns @@ -182,6 +199,8 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): return_mask : bool, default False If True, return a mask of the same shape as `values` indicating the valid values. + allow_nan : bool, default False + If False, raise an error if NaN are present. Returns ------- @@ -195,21 +214,20 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): if values.dtype == object: uniques_set = set(uniques) values_set = set(values) - array_values_set = np.array(values_set) - is_nan_in_value = np.any(_object_dtype_isnan(array_values_set)) + is_nan_in_value = any([is_scalar_nan(val) for val in values_set]) if is_nan_in_value: if not allow_nan: raise ValueError('Values contains NaN') elif any(_get_mask(uniques, np.nan)): - diff = np.array(array_values_set - uniques_set) - diff = diff[~_get_mask(diff, np.nan)] + diff = np.array(values_set - uniques_set) + diff = list(diff[~_get_mask(diff, np.nan)]) else: diff = list(values_set - uniques_set) else: diff = list(values_set - uniques_set) if return_mask: - if len(diff): + if diff: valid_mask = np.array([val in uniques_set for val in values]) else: valid_mask = np.ones(len(values), dtype=bool) @@ -221,7 +239,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): mask_nan_in_values = _get_mask(unique_values, np.nan) if np.any(mask_nan_in_values): if not allow_nan: - raise ValueError('Values conatins NaN') + raise ValueError('Values contains NaN') else: mask_nan_in_uniques = _get_mask(uniques, np.nan) if np.any(mask_nan_in_uniques): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 43169776a701d..7d2a24d729408 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -23,6 +23,7 @@ from sklearn.preprocessing.label import _inverse_binarize_thresholding from sklearn.preprocessing.label import _inverse_binarize_multiclass from sklearn.preprocessing.label import _encode +from sklearn.preprocessing.label import _encode_check_unknown from sklearn import datasets @@ -718,3 +719,33 @@ def test_encode_check_unknown_nan_object(uniques, values): match='y contains previously unseen label'): _encode(values, uniques, encode=True, check_unknown=True, allow_nan=True) + + +@pytest.mark.parametrize("return_mask", [True, False]) +@pytest.mark.parametrize( + "uniques, values", + [(np.array(['a', 'b', 'c'], dtype=object), + np.array(['a', 'b', 'c', np.nan], dtype=object)), + (np.array([np.nan, 'b', 'c'], dtype=object), + np.array([np.nan, 'b', 'c', 'd'], dtype=object)), + (np.array([1, 2, 3]), + np.array([1, 2, 3, np.nan])), + (np.array([np.nan, 2, 3]), + np.array([np.nan, 2, 3, 4]))]) +def test_check_unknown_nan_raise(uniques, values, return_mask): + # test for the check_unknown parameter of _encode() with nan present + + with pytest.raises(ValueError, + match='Values contains NaN'): + _encode_check_unknown(values, uniques, return_mask=return_mask, + allow_nan=False) + + +def test_nan_unique(): + # TODO + pass + + +def test_table_with_nan(): + # TODO + pass From db0513699346589ff8cd6cd7ec1f52a80b50befc Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 18 Sep 2019 17:19:23 +0200 Subject: [PATCH 11/16] ad test --- sklearn/preprocessing/label.py | 1 - sklearn/preprocessing/tests/test_label.py | 67 +++++++++++------------ 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/sklearn/preprocessing/label.py 
b/sklearn/preprocessing/label.py index 2294b6f894035..764c6932fc404 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -19,7 +19,6 @@ from ..utils.sparsefuncs import min_max_axis from ..utils import column_or_1d from ..utils import is_scalar_nan -from ..utils.fixes import _object_dtype_isnan from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 7d2a24d729408..9bcebea018bcb 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -219,35 +219,6 @@ def test_label_encoder_negative_ints(): le.transform([0, 6]) -def test_label_encode_with_nan(): - - # encode all nan within one category - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), - allow_nan=True)) == 1 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), - allow_nan=True)) == 1 - assert len(_encode(np.asarray([4, np.nan, np.nan]), allow_nan=True)) == 2 - - # the encoded size corresponds to the values size - assert len(_encode(np.asarray([np.nan, np.nan], dtype=float), - encode=True, allow_nan=True)[1]) == 2 - assert len(_encode(np.asarray([np.nan, np.nan], dtype=object), - encode=True, allow_nan=True)[1]) == 2 - - encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan]), - encode=True, allow_nan=True)[1] - assert_array_equal(encoded, [0, 1, 2, 2, 2]) - - -@pytest.mark.parametrize("values", - [np.asarray([np.nan, np.nan], dtype=float), - np.asarray([np.nan, np.nan], dtype=object)]) -def test_label_encode_raise_nan(values): - msg = 'Values contains NaN' - with pytest.raises(ValueError, match=msg): - _encode(values, allow_nan=False) - - @pytest.mark.parametrize("dtype", ['str', 'object']) def test_label_encoder_str_bad_shape(dtype): le = LabelEncoder() @@ -642,6 +613,7 @@ def test_inverse_binarize_multiclass(): assert_array_equal(got, np.array([1, 1, 0])) +@pytest.mark.parametrize("allow_nan", [True, False]) @pytest.mark.parametrize( "values, expected", [(np.array([2, 1, 3, 1, 3], dtype='int64'), @@ -651,19 +623,44 @@ def test_inverse_binarize_multiclass(): (np.array(['b', 'a', 'c', 'a', 'c']), np.array(['a', 'b', 'c']))], ids=['int64', 'object', 'str']) -def test_encode_util(values, expected): +def test_encode_util(values, expected, allow_nan): uniques = _encode(values) assert_array_equal(uniques, expected) - uniques, encoded = _encode(values, encode=True) + uniques, encoded = _encode(values, encode=True, allow_nan=allow_nan) assert_array_equal(uniques, expected) assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) - _, encoded = _encode(values, uniques, encode=True) + _, encoded = _encode(values, uniques, encode=True, allow_nan=allow_nan) assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) -@pytest.mark.parametrize( - "allow_nan", - [True, False]) +@pytest.mark.parametrize("dtype", [float, object]) +def test_label_encode_with_nan(dtype): + + # encode all nan within one category + assert len(_encode(np.asarray([np.nan, np.nan, float('nan')], dtype=dtype), + allow_nan=True)) == 1 + assert len(_encode(np.asarray([4, np.nan, float('nan')], dtype=dtype), + allow_nan=True)) == 2 + + # the encoded size corresponds to the values size + assert len(_encode(np.asarray([np.nan, np.nan], dtype=dtype), + encode=True, allow_nan=True)[1]) == 2 + + encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan], dtype=dtype), + encode=True, allow_nan=True)[1] + 
assert_array_equal(encoded, [0, 1, 2, 2, 2]) + + +@pytest.mark.parametrize("values", + [np.asarray([np.nan, np.nan], dtype=float), + np.asarray([np.nan, np.nan], dtype=object)]) +def test_label_encode_raise_nan(values): + msg = 'Values contains NaN' + with pytest.raises(ValueError, match=msg): + _encode(values, allow_nan=False) + + +@pytest.mark.parametrize("allow_nan", [True, False]) def test_encode_check_unknown(allow_nan): # test for the check_unknown parameter of _encode() uniques = np.array([1, 2, 3]) From b7284f69d874fc01158205caf4f2308b5508ea1b Mon Sep 17 00:00:00 2001 From: twsthomas Date: Thu, 19 Sep 2019 15:23:27 +0200 Subject: [PATCH 12/16] add more tests --- sklearn/preprocessing/label.py | 4 +- sklearn/preprocessing/tests/test_label.py | 106 ++++++++++++++++++++-- 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 764c6932fc404..a2282f3ed6d7e 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -84,7 +84,7 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, return uniques -class TableWithNan(object): +class _TableWithNan(object): #  hash table which allows nan as a key def __init__(self): @@ -118,7 +118,7 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): uniques = np.array(uniques, dtype=values.dtype) if encode: # hash is not enough to identify nan - table = TableWithNan() + table = _TableWithNan() for i, val in enumerate(uniques): table.set(val, i) try: diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 9bcebea018bcb..5e7ef6103d8d8 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -10,6 +10,7 @@ from scipy.sparse import lil_matrix from sklearn.utils.multiclass import type_of_target +from sklearn.utils import is_scalar_nan from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_warns_message @@ -22,8 +23,10 @@ from sklearn.preprocessing.label import _inverse_binarize_thresholding from sklearn.preprocessing.label import _inverse_binarize_multiclass -from sklearn.preprocessing.label import _encode +from sklearn.preprocessing.label import _encode, _encode_numpy, _encode_python from sklearn.preprocessing.label import _encode_check_unknown +from sklearn.preprocessing.label import _nan_unique +from sklearn.preprocessing.label import _TableWithNan from sklearn import datasets @@ -738,11 +741,102 @@ def test_check_unknown_nan_raise(uniques, values, return_mask): allow_nan=False) -def test_nan_unique(): - # TODO - pass +@pytest.mark.parametrize( + ["values", "unique", "inverse"], + [(np.array([]), [], []), + (np.array(['a', 'a', 'a'], dtype=object), ['a'], [0, 0, 0]), + (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [0, 2, 1]), + (np.array(['a', 'b', 'c', 'a', 'b'], dtype=object), ['a', 'b', 'c'], + [0, 1, 2, 0, 1]), + (np.array([1, 2, 3]), [1, 2, 3], [0, 1, 2]), + (np.array([1, 1, 1]), [1], [0, 0, 0]), + (np.array([1, 2, 3, 3, 2, 1]), [1, 2, 3], [0, 1, 2, 2, 1, 0]), + ]) +def test_nan_unique_same_as_np(values, unique, inverse): + #  assert _nan_unique == np.unique + + assert_array_equal(unique, _nan_unique(values)) + assert_array_equal(unique, np.unique(values)) + + u, i = _nan_unique(values, return_inverse=True) + assert_array_equal(unique, u) + assert_array_equal(inverse, i) + u, i = np.unique(values, return_inverse=True) + assert_array_equal(unique, u) + 
assert_array_equal(inverse, i) + + +@pytest.mark.parametrize( + ["values", "unique", "inverse"], + [(np.array([]), [], []), + (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]), + # (np.array([np.nan, 'a', 'a'], dtype=object), + # ['a', np.nan], [1, 0, 0]), + # (np.array([np.nan, 'c', 'b'], dtype=object), + # ['b', 'c', np.nan], [0, 2, 1]), + # (np.array([np.nan, 'b', 'c', 'a', 'b'], dtype=object), + # ['a', 'b', 'c', np.nan], [3, 1, 2, 0, 1]), + (np.array([np.nan, 2, 3]), [2, 3, np.nan], [2, 0, 1]), + (np.array([np.nan, 1, 1]), [1, np.nan], [1, 0, 0]), + (np.array([np.nan, 2, 3, 3, 2, 1]), [1, 2, 3, np.nan], + [3, 1, 2, 2, 1, 0]), + ]) +def test_nan_unique_nan(values, unique, inverse): + nan_unique, nan_inverse = _nan_unique(values, return_inverse=True, + allow_nan=True) + for nu, u in zip(nan_unique, unique): + if is_scalar_nan(nu): + assert is_scalar_nan(u) + else: + assert nu == u + for ni, i in zip(nan_inverse, inverse): + if is_scalar_nan(ni): + assert is_scalar_nan(i) + else: + assert ni == i + + +@pytest.mark.parametrize('encode_type', [_encode_numpy, _encode_python]) +@pytest.mark.parametrize( + ["values", "unique", "inverse"], + [(np.array([]), [], []), + (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]), + (np.array([np.nan, 2, 3]), [2, 3, np.nan], [2, 0, 1]), + (np.array([np.nan, 1, 1]), [1, np.nan], [1, 0, 0]), + (np.array([np.nan, 2, 3, 3, 2, 1]), [1, 2, 3, np.nan], + [3, 1, 2, 2, 1, 0]), + ]) +def test_nan_encode_numpy_python(values, unique, inverse, encode_type): + nan_unique, nan_inverse = encode_type(values, encode=True, allow_nan=True) + for nu, u in zip(nan_unique, unique): + if is_scalar_nan(nu): + assert is_scalar_nan(u) + else: + assert nu == u + for ni, i in zip(nan_inverse, inverse): + if is_scalar_nan(ni): + assert is_scalar_nan(i) + else: + assert ni == i def test_table_with_nan(): - # TODO - pass + table = _TableWithNan() + table.set('a', 0) + table.set(42, 42) + + with pytest.raises(KeyError): + table.get(np.nan) + with pytest.raises(KeyError): + table.get(float('nan')) + with pytest.raises(KeyError): + table.get('b') + + table.set(np.nan, 1) + assert table.get('a') == 0 + assert table.get(42) == 42 + assert table.get(np.nan) == 1 + assert table.get(float('nan')) == 1 + + with pytest.raises(KeyError): + table.get(None) From c4c4982180716f87470fabe93a4a6b4eac1d46a2 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Thu, 19 Sep 2019 16:34:35 +0200 Subject: [PATCH 13/16] add more tests --- sklearn/preprocessing/label.py | 11 +- sklearn/preprocessing/tests/test_label.py | 122 +++++++++++++++------- 2 files changed, 93 insertions(+), 40 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a2282f3ed6d7e..d75e00330f45a 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -217,11 +217,14 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): if is_nan_in_value: if not allow_nan: raise ValueError('Values contains NaN') - elif any(_get_mask(uniques, np.nan)): - diff = np.array(values_set - uniques_set) - diff = list(diff[~_get_mask(diff, np.nan)]) + if any(_get_mask(uniques, np.nan)): + diff = list(values_set - uniques_set) + if diff: + diff = np.array(diff) + diff = list(diff[~_get_mask(diff, np.nan)]) else: diff = list(values_set - uniques_set) + # diff = [] ### else: diff = list(values_set - uniques_set) @@ -252,7 +255,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): diff = np.setdiff1d(unique_values, uniques, 
assume_unique=True) if return_mask: - if diff: + if len(diff): valid_mask = np.in1d(values, uniques) else: valid_mask = np.ones(len(values), dtype=bool) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 5e7ef6103d8d8..4764e04cbd975 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -664,11 +664,18 @@ def test_label_encode_raise_nan(values): @pytest.mark.parametrize("allow_nan", [True, False]) -def test_encode_check_unknown(allow_nan): +@pytest.mark.parametrize( + "uniques, values", + [(np.array(['a', 'b', 'c'], dtype=object), + np.array(['a', 'b', 'c', 'd'], dtype=object)), + (np.array([], dtype=object), + np.array([1], dtype=object)), + (np.array([], dtype=float), + np.array([1], dtype=float)), + (np.array([1, 2, 3]), + np.array([1, 2, 3, 4]))]) +def test_encode_check_unknown(values, uniques, allow_nan): # test for the check_unknown parameter of _encode() - uniques = np.array([1, 2, 3]) - values = np.array([1, 2, 3, 4]) - # Default is True, raise error with pytest.raises(ValueError, match='y contains previously unseen labels'): @@ -676,16 +683,10 @@ def test_encode_check_unknown(allow_nan): allow_nan=allow_nan) # dont raise error if False - _encode(values, uniques, encode=True, check_unknown=False, - allow_nan=allow_nan) - - # parameter is ignored for object dtype - uniques = np.array(['a', 'b', 'c'], dtype=object) - values = np.array(['a', 'b', 'c', 'd'], dtype=object) - with pytest.raises(ValueError, - match='y contains previously unseen labels'): - _encode(values, uniques, encode=True, check_unknown=False, - allow_nan=allow_nan) + # check_unknown is always True for dtype object + if values.dtype != object: + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) @pytest.mark.parametrize( @@ -741,8 +742,67 @@ def test_check_unknown_nan_raise(uniques, values, return_mask): allow_nan=False) +@pytest.mark.parametrize('allow_nan', [True, False]) @pytest.mark.parametrize( - ["values", "unique", "inverse"], + "values, uniques, diff, mask", + [(np.array(['a', 'a', 'a'], dtype=object), ['a'], [], [1, 1, 1]), + (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [], + [1, 1, 1]), + (np.array(['a', 'b', 'c', 'a', 'b'], dtype=object), ['a', 'b', 'c'], + [], [1, 1, 1, 1, 1]), + (np.array([1, 2, 3]), [1, 2, 3], [], [1, 1, 1]), + (np.array([1, 1, 1]), [1], [], [1, 1, 1]), + (np.array([1, 2, 3, 3, 2, 1]), [1, 2, 3], [], [1] * 6), + ]) +def test_encode_check_unknown_diff(values, uniques, diff, mask, allow_nan): + + diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True, + allow_nan=allow_nan) + assert_array_equal(diff, diff_) + assert_array_equal(mask, mask_) + + +@pytest.mark.parametrize( + "values, uniques, diff, mask", + [(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]), [], [1, 1, 1]), + (np.array([1, 1, float('nan')]), np.array([1, np.nan]), [], [1, 1, 1]), + (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3, np.nan]), + [], [1] * 6), + ]) +def test_encode_check_unknown_diff_with_nan(values, uniques, diff, mask): + + diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True, + allow_nan=True) + assert_array_equal(diff, diff_) + assert_array_equal(mask, mask_) + + +def assert_array_equal_with_nan(x, y): + for a, b in zip(x, y): + if is_scalar_nan(a): + assert is_scalar_nan(b) + else: + assert a == b + + +@pytest.mark.parametrize( + "values, uniques, diff, mask", + [(np.array([1, 2, np.nan]), np.array([1, 2]), [np.nan], 
[1, 1, 0]), + (np.array([np.nan, float('nan')]), np.array([9]), [np.nan], [0, 0]), + (np.array([np.nan, 1, 1]), np.array([1]), [float('nan')], [0, 1, 1]), + (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3]), + [], [1, 0, 1, 1, 1, 1]), + ]) +def test_encode_check_unknown_diff_nan_unseen(values, uniques, diff, mask): + + diff_, mask_ = _encode_check_unknown(values, uniques, return_mask=True, + allow_nan=True) + assert_array_equal_with_nan(mask, mask_) + assert_array_equal_with_nan(diff, diff_) + + +@pytest.mark.parametrize( + "values, unique, inverse", [(np.array([]), [], []), (np.array(['a', 'a', 'a'], dtype=object), ['a'], [0, 0, 0]), (np.array(['a', 'c', 'b'], dtype=object), ['a', 'b', 'c'], [0, 2, 1]), @@ -767,7 +827,7 @@ def test_nan_unique_same_as_np(values, unique, inverse): @pytest.mark.parametrize( - ["values", "unique", "inverse"], + "values, unique, inverse", [(np.array([]), [], []), (np.array([np.nan, np.nan, float('nan')]), [np.nan], [0, 0, 0]), # (np.array([np.nan, 'a', 'a'], dtype=object), @@ -784,16 +844,8 @@ def test_nan_unique_same_as_np(values, unique, inverse): def test_nan_unique_nan(values, unique, inverse): nan_unique, nan_inverse = _nan_unique(values, return_inverse=True, allow_nan=True) - for nu, u in zip(nan_unique, unique): - if is_scalar_nan(nu): - assert is_scalar_nan(u) - else: - assert nu == u - for ni, i in zip(nan_inverse, inverse): - if is_scalar_nan(ni): - assert is_scalar_nan(i) - else: - assert ni == i + assert_array_equal_with_nan(nan_unique, unique) + assert_array_equal_with_nan(nan_inverse, inverse) @pytest.mark.parametrize('encode_type', [_encode_numpy, _encode_python]) @@ -808,16 +860,14 @@ def test_nan_unique_nan(values, unique, inverse): ]) def test_nan_encode_numpy_python(values, unique, inverse, encode_type): nan_unique, nan_inverse = encode_type(values, encode=True, allow_nan=True) - for nu, u in zip(nan_unique, unique): - if is_scalar_nan(nu): - assert is_scalar_nan(u) - else: - assert nu == u - for ni, i in zip(nan_inverse, inverse): - if is_scalar_nan(ni): - assert is_scalar_nan(i) - else: - assert ni == i + assert_array_equal_with_nan(nan_unique, unique) + assert_array_equal_with_nan(nan_inverse, inverse) + + # test also _nan_unique + nan_unique, nan_inverse = _nan_unique(values, return_inverse=True, + allow_nan=True) + assert_array_equal_with_nan(nan_unique, unique) + assert_array_equal_with_nan(nan_inverse, inverse) def test_table_with_nan(): From 6d9d0554282cce28fa0cd58e3536eb61d7edc897 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Mon, 23 Sep 2019 17:40:16 +0200 Subject: [PATCH 14/16] rename __DictWithNan --- sklearn/preprocessing/label.py | 30 ++++++---- sklearn/preprocessing/tests/test_label.py | 68 ++++++++++++----------- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index d75e00330f45a..c14ccba48a272 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -35,7 +35,13 @@ def _nan_unique(ar, return_inverse=False, allow_nan=False): - # mimic np.unique with allow_nan option + """ mimic np.unique where all nan are treated as the same one + + If allow_nan is False, ValueError is raise if ar contains nan. + Otherwise, if `ar` contains (possibly some) nan, + `uniques` will contains only one nan (contrary to np.unique), and + `inverse` will map all the nan from `ar` to this single nan in `uniques`. 
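+
+    For instance (a sketch worked out by hand, not a doctest):
+    ``_nan_unique(np.array([4., np.nan, 4., np.nan]), return_inverse=True,
+    allow_nan=True)`` gives ``uniques = [4., nan]`` and
+    ``inverse = [0, 1, 0, 1]``.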
+ """ if return_inverse: uniques, inverse = np.unique(ar, return_inverse=True) @@ -84,23 +90,23 @@ def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, return uniques -class _TableWithNan(object): - #  hash table which allows nan as a key +class _DictWithNan(dict): + # dict which allows nan as a key def __init__(self): - self.dict = dict() self.nan_value = None - def get(self, key): + def __getitem__(self, key): if is_scalar_nan(key) and self.nan_value is not None: return self.nan_value - return self.dict[key] + else: + return self.__dict__[key] - def set(self, key, value): + def __setitem__(self, key, item): if is_scalar_nan(key): - self.nan_value = value + self.nan_value = item else: - self.dict[key] = value + self.__dict__[key] = item def _encode_python(values, uniques=None, encode=False, allow_nan=False): @@ -118,11 +124,11 @@ def _encode_python(values, uniques=None, encode=False, allow_nan=False): uniques = np.array(uniques, dtype=values.dtype) if encode: # hash is not enough to identify nan - table = _TableWithNan() + table = _DictWithNan() for i, val in enumerate(uniques): - table.set(val, i) + table[val] = i try: - encoded = np.array([table.get(val) for val in values]) + encoded = np.array([table[val] for val in values]) except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 4764e04cbd975..52e071986d7fc 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -26,7 +26,7 @@ from sklearn.preprocessing.label import _encode, _encode_numpy, _encode_python from sklearn.preprocessing.label import _encode_check_unknown from sklearn.preprocessing.label import _nan_unique -from sklearn.preprocessing.label import _TableWithNan +from sklearn.preprocessing.label import _DictWithNan from sklearn import datasets @@ -636,24 +636,6 @@ def test_encode_util(values, expected, allow_nan): assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) -@pytest.mark.parametrize("dtype", [float, object]) -def test_label_encode_with_nan(dtype): - - # encode all nan within one category - assert len(_encode(np.asarray([np.nan, np.nan, float('nan')], dtype=dtype), - allow_nan=True)) == 1 - assert len(_encode(np.asarray([4, np.nan, float('nan')], dtype=dtype), - allow_nan=True)) == 2 - - # the encoded size corresponds to the values size - assert len(_encode(np.asarray([np.nan, np.nan], dtype=dtype), - encode=True, allow_nan=True)[1]) == 2 - - encoded = _encode(np.asarray([4, 5, np.nan, np.nan, np.nan], dtype=dtype), - encode=True, allow_nan=True)[1] - assert_array_equal(encoded, [0, 1, 2, 2, 2]) - - @pytest.mark.parametrize("values", [np.asarray([np.nan, np.nan], dtype=float), np.asarray([np.nan, np.nan], dtype=object)]) @@ -765,7 +747,8 @@ def test_encode_check_unknown_diff(values, uniques, diff, mask, allow_nan): @pytest.mark.parametrize( "values, uniques, diff, mask", [(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]), [], [1, 1, 1]), - (np.array([1, 1, float('nan')]), np.array([1, np.nan]), [], [1, 1, 1]), + (np.array([1, 1, float('nan')]), np.array([1, np.nan]), + [], [1, 1, 1]), (np.array([1, np.nan, 3, 3, 2, 1]), np.array([1, 2, 3, np.nan]), [], [1] * 6), ]) @@ -785,6 +768,25 @@ def assert_array_equal_with_nan(x, y): assert a == b +@pytest.mark.parametrize( + "values, uniques, encoded", + [(np.array([4, np.nan, float('nan')]), [4, np.nan], + [0, 1, 1]), + (np.array([np.nan, float('nan')]), [np.nan], + 
[0, 0]), + (np.array([np.nan, 4, np.nan, 4]), [4, np.nan], + [1, 0, 1, 0]), + (np.array([np.nan]), [np.nan], [0]), + ]) +def test_label_encode_with_nan(values, uniques, encoded): + + assert_array_equal_with_nan(_encode(values, allow_nan=True), uniques) + + uniques_, encoded_ = _encode(values, encode=True, allow_nan=True) + assert_array_equal_with_nan(uniques, uniques_) + assert_array_equal_with_nan(encoded, encoded_) + + @pytest.mark.parametrize( "values, uniques, diff, mask", [(np.array([1, 2, np.nan]), np.array([1, 2]), [np.nan], [1, 1, 0]), @@ -870,23 +872,23 @@ def test_nan_encode_numpy_python(values, unique, inverse, encode_type): assert_array_equal_with_nan(nan_inverse, inverse) -def test_table_with_nan(): - table = _TableWithNan() - table.set('a', 0) - table.set(42, 42) +def test_dict_with_nan(): + table = _DictWithNan() + table['a'] = 0 + table[42] = 42 with pytest.raises(KeyError): - table.get(np.nan) + table[np.nan] with pytest.raises(KeyError): - table.get(float('nan')) + table[float('nan')] with pytest.raises(KeyError): - table.get('b') + table['b'] - table.set(np.nan, 1) - assert table.get('a') == 0 - assert table.get(42) == 42 - assert table.get(np.nan) == 1 - assert table.get(float('nan')) == 1 + table[np.nan] = 1 + assert table['a'] == 0 + assert table[42] == 42 + assert table[np.nan] == 1 + assert table[float('nan')] == 1 with pytest.raises(KeyError): - table.get(None) + table[None] From c76b42b70ceb4ff9847f6f4e7b4fe3c73cb71836 Mon Sep 17 00:00:00 2001 From: twsthomas Date: Mon, 23 Sep 2019 17:57:12 +0200 Subject: [PATCH 15/16] typo --- sklearn/preprocessing/tests/test_label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 52e071986d7fc..b6886e87d943c 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -667,8 +667,8 @@ def test_encode_check_unknown(values, uniques, allow_nan): # dont raise error if False # check_unknown is always True for dtype object if values.dtype != object: - _encode(values, uniques, encode=True, check_unknown=False, - allow_nan=allow_nan) + _encode(values, uniques, encode=True, check_unknown=False, + allow_nan=allow_nan) @pytest.mark.parametrize( From f3a120d2a328b1d9b7b576b1444a395e823cb6fb Mon Sep 17 00:00:00 2001 From: twsthomas Date: Wed, 25 Sep 2019 11:15:48 +0200 Subject: [PATCH 16/16] minor --- sklearn/preprocessing/label.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index c14ccba48a272..cd320455f8970 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -259,6 +259,7 @@ def _encode_check_unknown(values, uniques, return_mask=False, allow_nan=False): assume_unique=True) else: diff = np.setdiff1d(unique_values, uniques, assume_unique=True) + diff = list(diff) if return_mask: if len(diff):