Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit c9c89cf

Browse files
thomasjpfan and ogrisel
authored
ENH Adds support for drop + handle_unknown=ignore in the OneHotEncoder (#19041)
Co-authored-by: Olivier Grisel <[email protected]>
1 parent 57d3668 commit c9c89cf

File tree

4 files changed

+140
-18
lines changed

4 files changed

+140
-18
lines changed

doc/modules/preprocessing.rst

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -560,9 +560,7 @@ parameter allows the user to specify a category for each feature to be dropped.
560560
This is useful to avoid co-linearity in the input matrix in some classifiers.
561561
Such functionality is useful, for example, when using non-regularized
562562
regression (:class:`LinearRegression <sklearn.linear_model.LinearRegression>`),
563-
since co-linearity would cause the covariance matrix to be non-invertible.
564-
When this parameter is not None, ``handle_unknown`` must be set to
565-
``error``::
563+
since co-linearity would cause the covariance matrix to be non-invertible::
566564

567565
>>> X = [['male', 'from US', 'uses Safari'],
568566
... ['female', 'from Europe', 'uses Firefox']]
@@ -591,6 +589,30 @@ In the transformed `X`, the first column is the encoding of the feature with
591589
categories "male"/"female", while the remaining 6 columns are the encoding of
592590
the 2 features with respectively 3 categories each.
593591

592+
When `handle_unknown='ignore'` and `drop` is not None, unknown categories will
593+
be encoded as all zeros::
594+
595+
>>> drop_enc = preprocessing.OneHotEncoder(drop='first',
596+
... handle_unknown='ignore').fit(X)
597+
>>> X_test = [['unknown', 'America', 'IE']]
598+
>>> drop_enc.transform(X_test).toarray()
599+
array([[0., 0., 0., 0., 0.]])
600+
601+
All the categories in `X_test` are unknown during transform and will be mapped
602+
to all zeros. This means that unknown categories will have the same mapping as
603+
the dropped category. :meth:`OneHotEncoder.inverse_transform` will map all zeros
604+
to the dropped category if a category is dropped and `None` if a category is
605+
not dropped::
606+
607+
>>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False,
608+
... handle_unknown='ignore').fit(X)
609+
>>> X_test = [['unknown', 'America', 'IE']]
610+
>>> X_trans = drop_enc.transform(X_test)
611+
>>> X_trans
612+
array([[0., 0., 0., 0., 0., 0., 0.]])
613+
>>> drop_enc.inverse_transform(X_trans)
614+
array([['female', None, None]], dtype=object)
615+
594616
:class:`OneHotEncoder` supports categorical features with missing values by
595617
considering the missing values as an additional category::
596618

doc/whats_new/v1.0.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ Changelog
7979
:mod:`sklearn.cluster`
8080
......................
8181

82+
:mod:`sklearn.preprocessing`
83+
............................
84+
85+
- |Feature| :class:`preprocessing.OneHotEncoder` now supports combining
86+
`handle_unknown='ignore'` and dropping categories. :pr:`19041` by
87+
`Thomas Fan`_.
88+
8289
- |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and
8390
:class:`cluster.MiniBatchKMeans` is now faster, especially in multicore
8491
settings. :pr:`19002` by :user:`Jon Crall <Erotemic>` and

sklearn/preprocessing/_encoders.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Joris Van den Bossche <[email protected]>
33
# License: BSD 3 clause
44

5+
import warnings
56
import numpy as np
67
from scipy import sparse
78
import numbers
@@ -110,7 +111,8 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
110111
raise ValueError(msg)
111112
self.categories_.append(cats)
112113

113-
def _transform(self, X, handle_unknown='error', force_all_finite=True):
114+
def _transform(self, X, handle_unknown='error', force_all_finite=True,
115+
warn_on_unknown=False):
114116
X_list, n_samples, n_features = self._check_X(
115117
X, force_all_finite=force_all_finite)
116118

@@ -125,6 +127,7 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True):
125127
.format(len(self.categories_,), n_features)
126128
)
127129

130+
columns_with_unknown = []
128131
for i in range(n_features):
129132
Xi = X_list[i]
130133
diff, valid_mask = _check_unknown(Xi, self.categories_[i],
@@ -136,6 +139,8 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True):
136139
" during transform".format(diff, i))
137140
raise ValueError(msg)
138141
else:
142+
if warn_on_unknown:
143+
columns_with_unknown.append(i)
139144
# Set the problematic rows to an acceptable value and
140145
# continue. The rows are marked in `X_mask` and will be
141146
# removed later.
@@ -153,6 +158,11 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True):
153158
# already called above.
154159
X_int[:, i] = _encode(Xi, uniques=self.categories_[i],
155160
check_unknown=False)
161+
if columns_with_unknown:
162+
warnings.warn("Found unknown categories in columns "
163+
f"{columns_with_unknown} during transform. These "
164+
"unknown categories will be encoded as all zeros",
165+
UserWarning)
156166

157167
return X_int, X_mask
158168

@@ -327,14 +337,6 @@ def _validate_keywords(self):
327337
msg = ("handle_unknown should be either 'error' or 'ignore', "
328338
"got {0}.".format(self.handle_unknown))
329339
raise ValueError(msg)
330-
# If we have both dropped columns and ignored unknown
331-
# values, there will be ambiguous cells. This creates difficulties
332-
# in interpreting the model.
333-
if self.drop is not None and self.handle_unknown != 'error':
334-
raise ValueError(
335-
"`handle_unknown` must be 'error' when the drop parameter is "
336-
"specified, as both would create categories that are all "
337-
"zero.")
338340

339341
def _compute_drop_idx(self):
340342
if self.drop is None:
@@ -459,8 +461,11 @@ def transform(self, X):
459461
"""
460462
check_is_fitted(self)
461463
# validation of X happens in _check_X called by _transform
464+
warn_on_unknown = (self.handle_unknown == "ignore"
465+
and self.drop is not None)
462466
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown,
463-
force_all_finite='allow-nan')
467+
force_all_finite='allow-nan',
468+
warn_on_unknown=warn_on_unknown)
464469

465470
n_samples, n_features = X_int.shape
466471

@@ -509,8 +514,10 @@ def inverse_transform(self, X):
509514
"""
510515
Convert the data back to the original representation.
511516
512-
In case unknown categories are encountered (all zeros in the
513-
one-hot encoding), ``None`` is used to represent this category.
517+
When unknown categories are encountered (all zeros in the
518+
one-hot encoding), ``None`` is used to represent this category. If the
519+
feature with the unknown category has a dropped category, the dropped
520+
category will be its inverse.
514521
515522
Parameters
516523
----------
@@ -571,7 +578,14 @@ def inverse_transform(self, X):
571578
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
572579
# ignored unknown categories: we have a row of all zero
573580
if unknown.any():
574-
found_unknown[i] = unknown
581+
# if categories were dropped then unknown categories will
582+
# be mapped to the dropped category
583+
if self.drop_idx_ is None or self.drop_idx_[i] is None:
584+
found_unknown[i] = unknown
585+
else:
586+
X_tr[unknown, i] = self.categories_[i][
587+
self.drop_idx_[i]
588+
]
575589
else:
576590
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
577591
if dropped.any():

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -775,8 +775,6 @@ def test_one_hot_encoder_drop_manual(missing_value):
775775
"X_fit, params, err_msg",
776776
[([["Male"], ["Female"]], {'drop': 'second'},
777777
"Wrong input for parameter `drop`"),
778-
([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
779-
"`handle_unknown` must be 'error'"),
780778
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
781779
{'drop': np.asarray('b', dtype=object)},
782780
"Wrong input for parameter `drop`"),
@@ -914,6 +912,87 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type):
914912
assert np.isnan(ohe.categories_[0][-1])
915913

916914

915+
def test_ohe_drop_first_handle_unknown_ignore_warns():
916+
"""Check drop='first' and handle_unknown='ignore' during transform."""
917+
X = [['a', 0], ['b', 2], ['b', 1]]
918+
919+
ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
920+
X_trans = ohe.fit_transform(X)
921+
922+
X_expected = np.array([
923+
[0, 0, 0],
924+
[1, 0, 1],
925+
[1, 1, 0],
926+
])
927+
assert_allclose(X_trans, X_expected)
928+
929+
# Both categories are unknown
930+
X_test = [['c', 3]]
931+
X_expected = np.array([[0, 0, 0]])
932+
933+
warn_msg = (r"Found unknown categories in columns \[0, 1\] during "
934+
"transform. These unknown categories will be encoded as all "
935+
"zeros")
936+
with pytest.warns(UserWarning, match=warn_msg):
937+
X_trans = ohe.transform(X_test)
938+
assert_allclose(X_trans, X_expected)
939+
940+
# inverse_transform maps unknown categories to the dropped category
941+
X_inv = ohe.inverse_transform(X_expected)
942+
assert_array_equal(X_inv, np.array([['a', 0]], dtype=object))
943+
944+
945+
def test_ohe_drop_if_binary_handle_unknown_ignore_warns():
946+
"""Check drop='if_binary' and handle_unknown='ignore' during transform."""
947+
X = [['a', 0], ['b', 2], ['b', 1]]
948+
949+
ohe = OneHotEncoder(drop='if_binary', sparse=False,
950+
handle_unknown='ignore')
951+
X_trans = ohe.fit_transform(X)
952+
953+
X_expected = np.array([
954+
[0, 1, 0, 0],
955+
[1, 0, 0, 1],
956+
[1, 0, 1, 0],
957+
])
958+
assert_allclose(X_trans, X_expected)
959+
960+
# Both categories are unknown
961+
X_test = [['c', 3]]
962+
X_expected = np.array([[0, 0, 0, 0]])
963+
964+
warn_msg = (r"Found unknown categories in columns \[0, 1\] during "
965+
"transform. These unknown categories will be encoded as all "
966+
"zeros")
967+
with pytest.warns(UserWarning, match=warn_msg):
968+
X_trans = ohe.transform(X_test)
969+
assert_allclose(X_trans, X_expected)
970+
971+
# inverse_transform maps to the dropped category if one exists, else None
972+
X_inv = ohe.inverse_transform(X_expected)
973+
assert_array_equal(X_inv, np.array([['a', None]], dtype=object))
974+
975+
976+
def test_ohe_drop_first_explicit_categories():
977+
"""Check drop='first' and handle_unknown='ignore' during fit with
978+
categories passed in."""
979+
980+
X = [['a', 0], ['b', 2], ['b', 1]]
981+
982+
ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore',
983+
categories=[['b', 'a'], [1, 2]])
984+
ohe.fit(X)
985+
986+
X_test = [['c', 1]]
987+
X_expected = np.array([[0, 0]])
988+
989+
warn_msg = (r"Found unknown categories in columns \[0\] during transform. "
990+
r"These unknown categories will be encoded as all zeros")
991+
with pytest.warns(UserWarning, match=warn_msg):
992+
X_trans = ohe.transform(X_test)
993+
assert_allclose(X_trans, X_expected)
994+
995+
917996
def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
918997
"""Test ordinal encoder with nan passthrough fails when dtype=np.int32."""
919998

0 commit comments

Comments
 (0)