ENH Adds support for drop + handle_unknown=ignore in the OneHotEncoder (#19041)

thomasjpfan · ogrisel · web-flow · commit c9c89cfc85dd · 2021-03-31T16:45:38.000+02:00
Co-authored-by: Olivier Grisel <olivier.grisel@gmail.com>
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -560,9 +560,7 @@ parameter allows the user to specify a category for each feature to be dropped.
 This is useful to avoid co-linearity in the input matrix in some classifiers.
 Such functionality is useful, for example, when using non-regularized
 regression (:class:`LinearRegression <sklearn.linear_model.LinearRegression>`),
-since co-linearity would cause the covariance matrix to be non-invertible.
-When this parameter is not None, ``handle_unknown`` must be set to
-``error``::
+since co-linearity would cause the covariance matrix to be non-invertible::
 
     >>> X = [['male', 'from US', 'uses Safari'],
     ...      ['female', 'from Europe', 'uses Firefox']]
@@ -591,6 +589,30 @@ In the transformed `X`, the first column is the encoding of the feature with
 categories "male"/"female", while the remaining 6 columns is the encoding of
 the 2 features with respectively 3 categories each.
 
+When `handle_unknown='ignore'` and `drop` is not None, unknown categories will
+be encoded as all zeros::
+
+    >>> drop_enc = preprocessing.OneHotEncoder(drop='first',
+    ...                                        handle_unknown='ignore').fit(X)
+    >>> X_test = [['unknown', 'America', 'IE']]
+    >>> drop_enc.transform(X_test).toarray()
+    array([[0., 0., 0., 0., 0.]])
+
+All the categories in `X_test` are unknown during transform and will be mapped
+to all zeros. This means that unknown categories will have the same mapping as
+the dropped category. :meth:`OneHotEncoder.inverse_transform` will map all zeros
+to the dropped category if a category is dropped and `None` if a category is
+not dropped::
+
+    >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False,
+    ...                                        handle_unknown='ignore').fit(X)
+    >>> X_test = [['unknown', 'America', 'IE']]
+    >>> X_trans = drop_enc.transform(X_test)
+    >>> X_trans
+    array([[0., 0., 0., 0., 0., 0., 0.]])
+    >>> drop_enc.inverse_transform(X_trans)
+    array([['female', None, None]], dtype=object)
+
 :class:`OneHotEncoder` supports categorical features with missing values by
 considering the missing values as an additional category::
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -79,6 +79,13 @@ Changelog
 :mod:`sklearn.cluster`
 ......................
 
+:mod:`sklearn.preprocessing`
+............................
+
+- |Feature| :class:`preprocessing.OneHotEncoder` now supports
+  `handle_unknown='ignore'` and dropping categories. :pr:`19041` by
+  `Thomas Fan`_.
+
 - |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and
   :class:`cluster.MiniBatchKMeans` is now faster, especially in multicore
   settings. :pr:`19002` by :user:`Jon Crall <Erotemic>` and
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -2,6 +2,7 @@
 #          Joris Van den Bossche <jorisvandenbossche@gmail.com>
 # License: BSD 3 clause
 
+import warnings
 import numpy as np
 from scipy import sparse
 import numbers
@@ -110,7 +111,8 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
                         raise ValueError(msg)
             self.categories_.append(cats)
 
-    def _transform(self, X, handle_unknown='error', force_all_finite=True):
+    def _transform(self, X, handle_unknown='error', force_all_finite=True,
+                   warn_on_unknown=False):
         X_list, n_samples, n_features = self._check_X(
             X, force_all_finite=force_all_finite)
 
@@ -125,6 +127,7 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True):
                 .format(len(self.categories_,), n_features)
             )
 
+        columns_with_unknown = []
         for i in range(n_features):
             Xi = X_list[i]
             diff, valid_mask = _check_unknown(Xi, self.categories_[i],
@@ -136,6 +139,8 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True):
                            " during transform".format(diff, i))
                     raise ValueError(msg)
                 else:
+                    if warn_on_unknown:
+                        columns_with_unknown.append(i)
                     # Set the problematic rows to an acceptable value and
                     # continue `The rows are marked `X_mask` and will be
                     # removed later.
@@ -153,6 +158,11 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True):
             # already called above.
             X_int[:, i] = _encode(Xi, uniques=self.categories_[i],
                                   check_unknown=False)
+        if columns_with_unknown:
+            warnings.warn("Found unknown categories in columns "
+                          f"{columns_with_unknown} during transform. These "
+                          "unknown categories will be encoded as all zeros",
+                          UserWarning)
 
         return X_int, X_mask
 
@@ -327,14 +337,6 @@ def _validate_keywords(self):
             msg = ("handle_unknown should be either 'error' or 'ignore', "
                    "got {0}.".format(self.handle_unknown))
             raise ValueError(msg)
-        # If we have both dropped columns and ignored unknown
-        # values, there will be ambiguous cells. This creates difficulties
-        # in interpreting the model.
-        if self.drop is not None and self.handle_unknown != 'error':
-            raise ValueError(
-                "`handle_unknown` must be 'error' when the drop parameter is "
-                "specified, as both would create categories that are all "
-                "zero.")
 
     def _compute_drop_idx(self):
         if self.drop is None:
@@ -459,8 +461,11 @@ def transform(self, X):
         """
         check_is_fitted(self)
         # validation of X happens in _check_X called by _transform
+        warn_on_unknown = (self.handle_unknown == "ignore"
+                           and self.drop is not None)
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown,
-                                        force_all_finite='allow-nan')
+                                        force_all_finite='allow-nan',
+                                        warn_on_unknown=warn_on_unknown)
 
         n_samples, n_features = X_int.shape
 
@@ -509,8 +514,10 @@ def inverse_transform(self, X):
         """
         Convert the data back to the original representation.
 
-        In case unknown categories are encountered (all zeros in the
-        one-hot encoding), ``None`` is used to represent this category.
+        When unknown categories are encountered (all zeros in the
+        one-hot encoding), ``None`` is used to represent this category. If the
+        feature with the unknown category has a dropped category, the dropped
+        category will be its inverse.
 
         Parameters
         ----------
@@ -571,7 +578,14 @@ def inverse_transform(self, X):
                 unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                 # ignored unknown categories: we have a row of all zero
                 if unknown.any():
-                    found_unknown[i] = unknown
+                    # if categories were dropped then unknown categories will
+                    # be mapped to the dropped category
+                    if self.drop_idx_ is None or self.drop_idx_[i] is None:
+                        found_unknown[i] = unknown
+                    else:
+                        X_tr[unknown, i] = self.categories_[i][
+                            self.drop_idx_[i]
+                        ]
             else:
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -775,8 +775,6 @@ def test_one_hot_encoder_drop_manual(missing_value):
     "X_fit, params, err_msg",
     [([["Male"], ["Female"]], {'drop': 'second'},
      "Wrong input for parameter `drop`"),
-     ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
-     "`handle_unknown` must be 'error'"),
      ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
       {'drop': np.asarray('b', dtype=object)},
      "Wrong input for parameter `drop`"),
@@ -914,6 +912,87 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type):
     assert np.isnan(ohe.categories_[0][-1])
 
 
+def test_ohe_drop_first_handle_unknown_ignore_warns():
+    """Check drop='first' and handle_unknown='ignore' during transform."""
+    X = [['a', 0], ['b', 2], ['b', 1]]
+
+    ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
+    X_trans = ohe.fit_transform(X)
+
+    X_expected = np.array([
+        [0, 0, 0],
+        [1, 0, 1],
+        [1, 1, 0],
+    ])
+    assert_allclose(X_trans, X_expected)
+
+    # Both categories are unknown
+    X_test = [['c', 3]]
+    X_expected = np.array([[0, 0, 0]])
+
+    warn_msg = (r"Found unknown categories in columns \[0, 1\] during "
+                "transform. These unknown categories will be encoded as all "
+                "zeros")
+    with pytest.warns(UserWarning, match=warn_msg):
+        X_trans = ohe.transform(X_test)
+    assert_allclose(X_trans, X_expected)
+
+    # inverse_transform maps all zeros to the dropped (first) category
+    X_inv = ohe.inverse_transform(X_expected)
+    assert_array_equal(X_inv, np.array([['a', 0]], dtype=object))
+
+
+def test_ohe_drop_if_binary_handle_unknown_ignore_warns():
+    """Check drop='if_binary' and handle_unknown='ignore' during transform."""
+    X = [['a', 0], ['b', 2], ['b', 1]]
+
+    ohe = OneHotEncoder(drop='if_binary', sparse=False,
+                        handle_unknown='ignore')
+    X_trans = ohe.fit_transform(X)
+
+    X_expected = np.array([
+        [0, 1, 0, 0],
+        [1, 0, 0, 1],
+        [1, 0, 1, 0],
+    ])
+    assert_allclose(X_trans, X_expected)
+
+    # Both categories are unknown
+    X_test = [['c', 3]]
+    X_expected = np.array([[0, 0, 0, 0]])
+
+    warn_msg = (r"Found unknown categories in columns \[0, 1\] during "
+                "transform. These unknown categories will be encoded as all "
+                "zeros")
+    with pytest.warns(UserWarning, match=warn_msg):
+        X_trans = ohe.transform(X_test)
+    assert_allclose(X_trans, X_expected)
+
+    # inverse_transform: dropped category for binary col 0, None for col 1
+    X_inv = ohe.inverse_transform(X_expected)
+    assert_array_equal(X_inv, np.array([['a', None]], dtype=object))
+
+
+def test_ohe_drop_first_explicit_categories():
+    """Check drop='first' and handle_unknown='ignore' during fit with
+    categories passed in."""
+
+    X = [['a', 0], ['b', 2], ['b', 1]]
+
+    ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore',
+                        categories=[['b', 'a'], [1, 2]])
+    ohe.fit(X)
+
+    X_test = [['c', 1]]
+    X_expected = np.array([[0, 0]])
+
+    warn_msg = (r"Found unknown categories in columns \[0\] during transform. "
+                r"These unknown categories will be encoded as all zeros")
+    with pytest.warns(UserWarning, match=warn_msg):
+        X_trans = ohe.transform(X_test)
+    assert_allclose(X_trans, X_expected)
+
+
 def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
     """Test ordinal encoder with nan passthrough fails when dtype=np.int32."""