From bd2b938476e2d42c0a2d8e4ce95491d70fcf3b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Michel=20De=20Rainville?= Date: Thu, 16 May 2019 10:23:06 -0400 Subject: [PATCH 1/6] Added handle unknown option to ordinal encoder --- sklearn/preprocessing/_encoders.py | 43 +++++++++++++++-- sklearn/preprocessing/tests/test_encoders.py | 50 ++++++++++++++++++++ 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 6d11e7907984a..28ddf4a59a67c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -876,6 +876,14 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. + + handle_unknown : 'error' or 'ignore', default='error'. + Whether to raise an error or ignore if an unknown categorical feature + is present during transform (default is to raise). When this parameter + is set to 'ignore' and an unknown category is encountered during + transform, a numeric ``-1`` category is returned. In the inverse + transform, an unknown category will be denoted as None. + Attributes ---------- @@ -894,7 +902,8 @@ class OrdinalEncoder(_BaseEncoder): >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) ... # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='error') >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) @@ -904,6 +913,23 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) + + The ordinal encoding can be set to handle unknown categories at + transform time. 
+ + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + ... # doctest: +ELLIPSIS + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='ignore') + >>> enc.transform([['Female', 4], ['Other', 1]]) + array([[ 0., -1.], + [-1., 0.]]) + >>> enc.inverse_transform([[1, -1], [0, 1]]) + array([['Male', None], + ['Female', 2]], dtype=object) See also -------- @@ -913,9 +939,13 @@ class OrdinalEncoder(_BaseEncoder): between 0 and n_classes-1. """ - def __init__(self, categories='auto', dtype=np.float64): + def __init__(self, categories='auto', dtype=np.float64, + handle_unknown='error'): self.categories = categories self.dtype = dtype + self.handle_unknown = handle_unknown + self.unknown_category = None + self.unknown_value = -1 def fit(self, X, y=None): """Fit the OrdinalEncoder to X. @@ -933,7 +963,7 @@ def fit(self, X, y=None): # base classes uses _categories to deal with deprecations in # OneHoteEncoder: can be removed once deprecations are removed self._categories = self.categories - self._fit(X) + self._fit(X, handle_unknown=self.handle_unknown) return self @@ -951,7 +981,8 @@ def transform(self, X): Transformed input. 
""" - X_int, _ = self._transform(X) + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + X_int[np.logical_not(X_mask)] = self.unknown_value return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): @@ -988,6 +1019,10 @@ def inverse_transform(self, X): labels = X[:, i].astype('int64', copy=False) X_tr[:, i] = self.categories_[i][labels] + if self.handle_unknown == 'ignore': + unknown = labels == self.unknown_value + X_tr[unknown, i] = self.unknown_category + return X_tr def _more_tags(self): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index db201dcd58c15..7be876ad0be6a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -634,6 +634,22 @@ def test_ordinal_encoder(X): assert_array_equal(enc.fit_transform(X), exp) +@pytest.mark.parametrize("Xfit,Xtrans", [ + ([['abc', 2, 55], ['def', 1, 55]], [['ghi', 2, 55]]), + (np.array([[10, 2, 55], [20, 1, 55]]), np.array([[30, 2, 55]])), + (np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object), + np.array([['C', 'B', 'cat']], dtype=object)) + ], ids=['mixed', 'numeric', 'object']) +def test_ordinal_encoder_handle_unknown(Xfit, Xtrans): + enc = OrdinalEncoder(handle_unknown='ignore') + exp = np.array([[-1, 1, 0]], dtype='int64') + enc.fit_transform(Xfit) + assert_array_equal(enc.transform(Xtrans), exp.astype('float64')) + enc = OrdinalEncoder(dtype='int64', handle_unknown='ignore') + enc.fit(Xfit) + assert_array_equal(enc.transform(Xtrans), exp) + + @pytest.mark.parametrize("X, X2, cats, cat_dtype", [ (np.array([['a', 'b']], dtype=object).T, np.array([['a', 'd']], dtype=object).T, @@ -662,6 +678,26 @@ def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype): enc.fit(X2) +@pytest.mark.parametrize("X, cats, cat_dtype", [ + (np.array([['d', 'b']], dtype=object).T, + [['a', 'b', 'c']], np.object_), + (np.array([[4, 2]], dtype='int64').T, + [[1, 2, 3]], 
np.int64), + (np.array([['d', 'b']], dtype=object).T, + [np.array(['a', 'b', 'c'])], np.object_), + ], ids=['object', 'numeric', 'object-string-cat']) +def test_ordinal_encoder_specified_categories_handle_unknown(X, cats, + cat_dtype): + enc = OrdinalEncoder(categories=cats, handle_unknown='ignore') + exp = np.array([[-1.], [1.]]) + assert_array_equal(enc.fit_transform(X), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + def test_ordinal_encoder_inverse(): X = [['abc', 2, 55], ['def', 1, 55]] enc = OrdinalEncoder() @@ -675,6 +711,20 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) +def test_ordinal_encoder_inverse_handle_unknown(): + X = [['abc', 2, 55], ['def', 1, 55]] + Xu = [['ghi', 2, 55]] + enc = OrdinalEncoder(handle_unknown='ignore') + X_tr = enc.fit(X).transform(Xu) + exp = np.array([[None, 2, 55]], dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) + msg = re.escape('Shape of the passed X data is not correct') + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) + + @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], ids=['numeric', 'object']) From cb35c1c94cc3914abb70a99913100fa41265cf85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Michel=20De=20Rainville?= Date: Thu, 16 May 2019 19:45:05 -0400 Subject: [PATCH 2/6] Fix flake8 and init arguments --- sklearn/preprocessing/_encoders.py | 21 +++++++++++++++----- sklearn/preprocessing/tests/test_encoders.py | 3 ++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 28ddf4a59a67c..bbeaed4a36d9e 
100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -876,13 +876,23 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. - + handle_unknown : 'error' or 'ignore', default='error'. Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter is set to 'ignore' and an unknown category is encountered during transform, a numeric ``-1`` category is returned. In the inverse transform, an unknown category will be denoted as None. + + unknown_category : object + Desired value that will take unknown value in + :meth:`~OrdinalEncoder.inverse_transform`. + + unknown_value : integer + Desired value that will take unknown value in + :meth:`~OrdinalEncoder.transform`. Warning that if this value + is in :math:`[0, N-1]`, where N is the number of categories, + it will be indistinguishable from the original category. Attributes @@ -913,7 +923,7 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) - + The ordinal encoding can be set to handle unknown categories at transform time. @@ -940,12 +950,13 @@ class OrdinalEncoder(_BaseEncoder): """ def __init__(self, categories='auto', dtype=np.float64, - handle_unknown='error'): + handle_unknown='error', unknown_category=None, + unknown_value=-1): self.categories = categories self.dtype = dtype self.handle_unknown = handle_unknown - self.unknown_category = None - self.unknown_value = -1 + self.unknown_category = unknown_category + self.unknown_value = unknown_value def fit(self, X, y=None): """Fit the OrdinalEncoder to X. 
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 7be876ad0be6a..eb4ea2ab5eee7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -687,7 +687,7 @@ def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype): [np.array(['a', 'b', 'c'])], np.object_), ], ids=['object', 'numeric', 'object-string-cat']) def test_ordinal_encoder_specified_categories_handle_unknown(X, cats, - cat_dtype): + cat_dtype): enc = OrdinalEncoder(categories=cats, handle_unknown='ignore') exp = np.array([[-1.], [1.]]) assert_array_equal(enc.fit_transform(X), exp) @@ -753,6 +753,7 @@ def test_ordinal_encoder_raise_categories_shape(): with pytest.raises(ValueError, match=msg): enc.fit(X) + def test_encoder_dtypes(): # check that dtypes are preserved when determining categories enc = OneHotEncoder(categories='auto') From 3cfe016a9dd2988084d5e4bb794bc2070f8035a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Michel=20De=20Rainville?= Date: Thu, 16 May 2019 19:47:24 -0400 Subject: [PATCH 3/6] Removed white spaces --- sklearn/preprocessing/_encoders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index bbeaed4a36d9e..17667f0c93312 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -883,11 +883,11 @@ class OrdinalEncoder(_BaseEncoder): is set to 'ignore' and an unknown category is encountered during transform, a numeric ``-1`` category is returned. In the inverse transform, an unknown category will be denoted as None. - + unknown_category : object Desired value that will take unknown value in :meth:`~OrdinalEncoder.inverse_transform`. - + unknown_value : integer Desired value that will take unknown value in :meth:`~OrdinalEncoder.transform`. 
Warning that if this value From a786ce7c2820ada9a781ee32db5f23ef1e9c4a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Michel=20De=20Rainville?= Date: Thu, 16 May 2019 20:25:47 -0400 Subject: [PATCH 4/6] Corrects doctest --- sklearn/preprocessing/_encoders.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 17667f0c93312..7aed0f06c983b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -913,7 +913,7 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.fit(X) ... # doctest: +ELLIPSIS OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_unknown='error') + handle_unknown='error', unknown_category=None, unknown_value=-1) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) @@ -933,7 +933,8 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.fit(X) ... # doctest: +ELLIPSIS OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_unknown='ignore') + handle_unknown='ignore', unknown_category=None, + unknown_value=-1) >>> enc.transform([['Female', 4], ['Other', 1]]) array([[ 0., -1.], [-1., 0.]]) From 573a7516b77c40f4a4dc50b69ea833f9532822d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Michel=20De=20Rainville?= Date: Thu, 16 May 2019 20:35:38 -0400 Subject: [PATCH 5/6] flake8 and doctest conflict for line length --- sklearn/preprocessing/_encoders.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 7aed0f06c983b..215b00bc1bfe9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -913,7 +913,7 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.fit(X) ... # doctest: +ELLIPSIS OrdinalEncoder(categories='auto', dtype=<... 
'numpy.float64'>, - handle_unknown='error', unknown_category=None, unknown_value=-1) + handle_unknown='error', ...) >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 3], ['Male', 1]]) @@ -933,8 +933,7 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.fit(X) ... # doctest: +ELLIPSIS OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_unknown='ignore', unknown_category=None, - unknown_value=-1) + handle_unknown='ignore', ...) >>> enc.transform([['Female', 4], ['Other', 1]]) array([[ 0., -1.], [-1., 0.]]) From 4ef1c8a367981c0fa9d5fbf5b0a8055283395bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Michel=20De=20Rainville?= Date: Wed, 22 May 2019 06:19:14 -0400 Subject: [PATCH 6/6] Fixed doctest --- doc/modules/preprocessing.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4c68f9e635498..b43a4d6871b77 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -481,8 +481,9 @@ new feature of integers (0 to n_categories - 1):: >>> enc = preprocessing.OrdinalEncoder() >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] - >>> enc.fit(X) # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + >>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='error', unknown_category=None, unknown_value=-1) >>> enc.transform([['female', 'from US', 'uses Safari']]) array([[0., 1., 1.]]) @@ -561,8 +562,8 @@ parameter allows the user to specify a category for each feature to be dropped. This is useful to avoid co-linearity in the input matrix in some classifiers. 
Such functionality is useful, for example, when using non-regularized regression (:class:`LinearRegression <sklearn.linear_model.LinearRegression>`), -since co-linearity would cause the covariance matrix to be non-invertible. -When this paramenter is not None, ``handle_unknown`` must be set to +since co-linearity would cause the covariance matrix to be non-invertible. +When this parameter is not None, ``handle_unknown`` must be set to ``error``:: >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]