Commit b000feb

thomasjpfan committed
FIX Fixes bug in OneHotEncoder's drop_idx_ when there are infrequent categories (scikit-learn#25589)
Co-authored-by: Guillaume Lemaitre <[email protected]>
Co-authored-by: Jérémie du Boisberranger <[email protected]>
Co-authored-by: Julien Jerphanion <[email protected]>
1 parent 75c1bdf commit b000feb

File tree

3 files changed: +108 -23 lines changed

3 files changed

+108
-23
lines changed
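For context: before this commit, `drop_idx_` was an index into the category list *after* infrequent grouping, so indexing `categories_` with it could point at the wrong category. A minimal sketch of the fixed behavior, mirroring the non-regression test added below (comments show the expected output with this fix applied):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # "a" appears only twice, so min_frequency=4 folds it into the
    # "infrequent_sklearn" bucket; "b" is the first frequent category.
    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    # drop_idx_ now references categories_ directly, so the dropped
    # category can be looked up without knowing the grouped ordering.
    print(ohe.categories_[0][ohe.drop_idx_[0]])  # "b"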

doc/whats_new/v1.2.rst

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,10 @@ Changelog
 :mod:`sklearn.preprocessing`
 ............................
 
+- |Fix| :attr:`preprocessing.OneHotEncoder.drop_idx_` now properly
+  references the dropped category in the `categories_` attribute
+  when there are infrequent categories. :pr:`25589` by `Thomas Fan`_.
+
 - |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports
   `encoded_missing_value` or `unknown_value` set to a categories' cardinality
   when there is missing values in the training data. :pr:`25704` by `Thomas Fan`_.

sklearn/preprocessing/_encoders.py

Lines changed: 67 additions & 22 deletions
@@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
           should be dropped.
 
+        When `max_categories` or `min_frequency` is configured to group
+        infrequent categories, the dropping behavior is handled after the
+        grouping.
+
         .. versionadded:: 0.21
             The parameter `drop` was added in 0.21.
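To illustrate the docstring note added above: with `min_frequency` set, grouping happens first, and only then is the drop applied to the grouped categories. A small sketch based on the `if_binary` case from the new test:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # "a" and "b" are infrequent, so grouping leaves two effective
    # categories, "c" and "infrequent_sklearn"; drop="if_binary" then
    # drops one of those two, not one of the three raw categories.
    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)

    print(ohe.get_feature_names_out())           # ["x0_infrequent_sklearn"]
    print(ohe.categories_[0][ohe.drop_idx_[0]])  # "c"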
@@ -514,7 +518,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
         """Convert `drop_idx` into the index for infrequent categories.
 
         If there are no infrequent categories, then `drop_idx` is
-        returned. This method is called in `_compute_drop_idx` when the `drop`
+        returned. This method is called in `_set_drop_idx` when the `drop`
         parameter is an array-like.
         """
         if not self._infrequent_enabled:
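`_default_to_infrequent_mappings` is a private detail; as a hedged sketch of the lookup this method performs, assume position `k` of the mapping holds category `k`'s index after infrequent grouping (the concrete values here are illustrative, not taken from the library):

    import numpy as np

    # Illustrative mapping for categories_ = ["a", "b", "c", "d", "e"]
    # with "a" infrequent: grouped order is ["b", "c", "d", "e", infrequent].
    default_to_infrequent = np.array([4, 0, 1, 2, 3])

    drop_idx = 1                            # "b" in the original ordering
    print(default_to_infrequent[drop_idx])  # 0: "b" comes first after grouping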
@@ -534,24 +538,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
             )
         return default_to_infrequent[drop_idx]
 
-    def _compute_drop_idx(self):
+    def _set_drop_idx(self):
         """Compute the drop indices associated with `self.categories_`.
 
         If `self.drop` is:
-        - `None`, returns `None`.
-        - `'first'`, returns all zeros to drop the first category.
-        - `'if_binary'`, returns zero if the category is binary and `None`
+        - `None`, No categories have been dropped.
+        - `'first'`, All zeros to drop the first category.
+        - `'if_binary'`, All zeros if the category is binary and `None`
           otherwise.
-        - array-like, returns the indices of the categories that match the
+        - array-like, The indices of the categories that match the
           categories in `self.drop`. If the dropped category is an infrequent
           category, then the index for the infrequent category is used. This
           means that the entire infrequent category is dropped.
+
+        This method defines a public `drop_idx_` and a private
+        `_drop_idx_after_grouping`.
+
+        - `drop_idx_`: Public facing API that references the drop category in
+          `self.categories_`.
+        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
+          infrequent categories are grouped together.
+
+        If there are no infrequent categories or drop is `None`, then
+        `drop_idx_=_drop_idx_after_grouping`.
         """
         if self.drop is None:
-            return None
+            drop_idx_after_grouping = None
         elif isinstance(self.drop, str):
             if self.drop == "first":
-                return np.zeros(len(self.categories_), dtype=object)
+                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
             elif self.drop == "if_binary":
                 n_features_out_no_drop = [len(cat) for cat in self.categories_]
                 if self._infrequent_enabled:
@@ -560,7 +575,7 @@ def _compute_drop_idx(self):
                         continue
                     n_features_out_no_drop[i] -= infreq_idx.size - 1
 
-                return np.array(
+                drop_idx_after_grouping = np.array(
                     [
                         0 if n_features_out == 2 else None
                         for n_features_out in n_features_out_no_drop
@@ -617,7 +632,29 @@ def _compute_drop_idx(self):
                     )
                 )
                 raise ValueError(msg)
-            return np.array(drop_indices, dtype=object)
+            drop_idx_after_grouping = np.array(drop_indices, dtype=object)
+
+        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
+        # categories are grouped together. If needed, we remap `drop_idx` back
+        # to the categories seen in `self.categories_`.
+        self._drop_idx_after_grouping = drop_idx_after_grouping
+
+        if not self._infrequent_enabled or drop_idx_after_grouping is None:
+            self.drop_idx_ = self._drop_idx_after_grouping
+        else:
+            drop_idx_ = []
+            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
+                default_to_infrequent = self._default_to_infrequent_mappings[
+                    feature_idx
+                ]
+                if drop_idx is None or default_to_infrequent is None:
+                    orig_drop_idx = drop_idx
+                else:
+                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
+
+                drop_idx_.append(orig_drop_idx)
+
+            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
 
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
@@ -779,16 +816,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):
 
     def _remove_dropped_categories(self, categories, i):
         """Remove dropped categories."""
-        if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
-            return np.delete(categories, self.drop_idx_[i])
+        if (
+            self._drop_idx_after_grouping is not None
+            and self._drop_idx_after_grouping[i] is not None
+        ):
+            return np.delete(categories, self._drop_idx_after_grouping[i])
         return categories
 
     def _compute_n_features_outs(self):
         """Compute the n_features_out for each input feature."""
         output = [len(cats) for cats in self.categories_]
 
-        if self.drop_idx_ is not None:
-            for i, drop_idx in enumerate(self.drop_idx_):
+        if self._drop_idx_after_grouping is not None:
+            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                 if drop_idx is not None:
                     output[i] -= 1
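A quick sanity check of the bookkeeping in `_compute_n_features_outs`, reusing the toy data from the first sketch: the lone infrequent category is replaced by the infrequent bucket (still five grouped categories), and the drop removes one column:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    # 5 grouped categories ("b", "c", "d", "e", "infrequent_sklearn")
    # minus the 1 dropped category -> 4 output features.
    print(len(ohe.get_feature_names_out()))  # 4
    print(ohe.transform(X[:1]).shape)        # (1, 4)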

@@ -845,7 +885,7 @@ def fit(self, X, y=None):
         self._fit_infrequent_category_mapping(
             fit_results["n_samples"], fit_results["category_counts"]
         )
-        self.drop_idx_ = self._compute_drop_idx()
+        self._set_drop_idx()
         self._n_features_outs = self._compute_n_features_outs()
         return self
@@ -884,8 +924,8 @@ def transform(self, X):
 
         n_samples, n_features = X_int.shape
 
-        if self.drop_idx_ is not None:
-            to_drop = self.drop_idx_.copy()
+        if self._drop_idx_after_grouping is not None:
+            to_drop = self._drop_idx_after_grouping.copy()
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
             keep_cells = X_int != to_drop
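Since `to_drop` now holds indices in the grouped space, the dropped category encodes to an all-zero row while infrequent raw values land in the bucket column; same toy data as above:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    # Columns are ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"].
    print(ohe.transform(np.array([["b"]], dtype=object)))  # [[0. 0. 0. 0.]] dropped
    print(ohe.transform(np.array([["a"]], dtype=object)))  # [[0. 0. 0. 1.]] infrequent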
@@ -984,7 +1024,7 @@ def inverse_transform(self, X):
             # category. In this case we just fill the column with this
             # unique category value.
             if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
+                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                 j += n_categories
                 continue
             sub = X[:, j : j + n_categories]
@@ -1001,14 +1041,19 @@ def inverse_transform(self, X):
                 if unknown.any():
                     # if categories were dropped then unknown categories will
                     # be mapped to the dropped category
-                    if self.drop_idx_ is None or self.drop_idx_[i] is None:
+                    if (
+                        self._drop_idx_after_grouping is None
+                        or self._drop_idx_after_grouping[i] is None
+                    ):
                         found_unknown[i] = unknown
                     else:
-                        X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
+                        X_tr[unknown, i] = self.categories_[i][
+                            self._drop_idx_after_grouping[i]
+                        ]
                 else:
                     dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                     if dropped.any():
-                        if self.drop_idx_ is None:
+                        if self._drop_idx_after_grouping is None:
                             all_zero_samples = np.flatnonzero(dropped)
                             raise ValueError(
                                 f"Samples {all_zero_samples} can not be inverted "
@@ -1017,7 +1062,7 @@ def inverse_transform(self, X):
                             )
                         # we can safely assume that all of the nulls in each column
                         # are the dropped value
-                        drop_idx = self.drop_idx_[i]
+                        drop_idx = self._drop_idx_after_grouping[i]
                         X_tr[dropped, i] = transformed_features[i][drop_idx]
 
             j += n_categories
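Conversely, `inverse_transform` maps an all-zero row back to the dropped category and a bucket hit back to the literal infrequent label; a round-trip sketch on the same data (expected output hedged, not taken from the test suite):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    Xt = ohe.transform(np.array([["b"], ["a"]], dtype=object))
    print(ohe.inverse_transform(Xt))  # [["b"] ["infrequent_sklearn"]]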

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 37 additions & 1 deletion
@@ -903,7 +903,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop):
         max_categories=2,
         drop=drop,
     ).fit(X_train)
-    assert_array_equal(ohe.drop_idx_, [0])
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
 
     X_test = np.array([["b"], ["c"]])
     X_trans = ohe.transform(X_test)
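The assertion changes because `drop_idx_` now indexes `self.categories_` rather than the grouped category list, so the test looks the dropped category up by value instead of pinning a fixed grouped index.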
@@ -1989,3 +1989,39 @@ def test_ordinal_encoder_missing_unknown_encoding_max():
     X_test = np.array([["snake"]])
     X_trans = enc.transform(X_test)
     assert_allclose(X_trans, [[2]])
+
+
+def test_drop_idx_infrequent_categories():
+    """Check drop_idx is defined correctly with infrequent categories.
+
+    Non-regression test for gh-25550.
+    """
+    X = np.array(
+        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
+    ).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
+    )
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
+
+    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
+    assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"
+
+    X = np.array(
+        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
+    ).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
+    )
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"
+
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(),
+        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
+    )
+    assert ohe.drop_idx_ is None
