@@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
           should be dropped.
 
+        When `max_categories` or `min_frequency` is configured to group
+        infrequent categories, the dropping behavior is handled after the
+        grouping.
+
         .. versionadded:: 0.21
            The parameter `drop` was added in 0.21.
 
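A small, hedged illustration of the documented interaction (not part of the diff itself; it assumes a scikit-learn release where `drop` can be combined with infrequent-category grouping, i.e. the behavior this change targets):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# One feature with two frequent and two infrequent categories.
X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T

# Infrequent categories ('a' and 'd' here) are grouped first; the drop logic
# then sees three grouped categories, so `if_binary` drops nothing.
enc = OneHotEncoder(min_frequency=6, drop="if_binary").fit(X)
print(enc.infrequent_categories_)  # expected: [array(['a', 'd'], dtype=object)]
print(enc.drop_idx_)               # expected: array([None], dtype=object)
```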
@@ -544,7 +548,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
         """Convert `drop_idx` into the index for infrequent categories.
 
         If there are no infrequent categories, then `drop_idx` is
-        returned. This method is called in `_compute_drop_idx` when the `drop`
+        returned. This method is called in `_set_drop_idx` when the `drop`
         parameter is an array-like.
         """
         if not self._infrequent_enabled:
@@ -564,24 +568,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
             )
         return default_to_infrequent[drop_idx]
 
-    def _compute_drop_idx(self):
+    def _set_drop_idx(self):
         """Compute the drop indices associated with `self.categories_`.
 
         If `self.drop` is:
-        - `None`, returns `None`.
-        - `'first'`, returns all zeros to drop the first category.
-        - `'if_binary'`, returns zero if the category is binary and `None`
+        - `None`, no categories have been dropped.
+        - `'first'`, all zeros to drop the first category.
+        - `'if_binary'`, all zeros if the category is binary and `None`
           otherwise.
-        - array-like, returns the indices of the categories that match the
+        - array-like, the indices of the categories that match the
           categories in `self.drop`. If the dropped category is an infrequent
           category, then the index for the infrequent category is used. This
           means that the entire infrequent category is dropped.
+
+        This method defines a public `drop_idx_` and a private
+        `_drop_idx_after_grouping`.
+
+        - `drop_idx_`: Public facing API that references the drop category in
+          `self.categories_`.
+        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
+          infrequent categories are grouped together.
+
+        If there are no infrequent categories or drop is `None`, then
+        `drop_idx_=_drop_idx_after_grouping`.
         """
         if self.drop is None:
-            return None
+            drop_idx_after_grouping = None
         elif isinstance(self.drop, str):
             if self.drop == "first":
-                return np.zeros(len(self.categories_), dtype=object)
+                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
             elif self.drop == "if_binary":
                 n_features_out_no_drop = [len(cat) for cat in self.categories_]
                 if self._infrequent_enabled:
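The docstring above distinguishes the public and private drop indices. A hedged sketch of how that difference can surface (exact values depend on the fitted data, and `_drop_idx_after_grouping` is a private attribute accessed here only for illustration):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["apple"] * 2 + ["banana"] * 10 + ["cherry"] * 10]).T

# 'apple' is infrequent, so after grouping the categories become
# ['banana', 'cherry', infrequent]; dropping 'cherry' therefore uses
# different indices before and after grouping.
enc = OneHotEncoder(min_frequency=5, drop=["cherry"]).fit(X)
print(enc.categories_)               # [array(['apple', 'banana', 'cherry'], ...)]
print(enc.drop_idx_)                 # expected: [2], i.e. 'cherry' in categories_
print(enc._drop_idx_after_grouping)  # expected: [1], its index after grouping
```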
@@ -590,7 +605,7 @@ def _compute_drop_idx(self):
                             continue
                         n_features_out_no_drop[i] -= infreq_idx.size - 1
 
-                return np.array(
+                drop_idx_after_grouping = np.array(
                     [
                         0 if n_features_out == 2 else None
                         for n_features_out in n_features_out_no_drop
@@ -647,7 +662,29 @@ def _compute_drop_idx(self):
                     )
                 )
                 raise ValueError(msg)
-            return np.array(drop_indices, dtype=object)
+            drop_idx_after_grouping = np.array(drop_indices, dtype=object)
+
+        # `_drop_idx_after_grouping` holds the categories to drop *after* the
+        # infrequent categories are grouped together. If needed, we remap
+        # `drop_idx` back to the categories seen in `self.categories_`.
+        self._drop_idx_after_grouping = drop_idx_after_grouping
+
+        if not self._infrequent_enabled or drop_idx_after_grouping is None:
+            self.drop_idx_ = self._drop_idx_after_grouping
+        else:
+            drop_idx_ = []
+            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
+                default_to_infrequent = self._default_to_infrequent_mappings[
+                    feature_idx
+                ]
+                if drop_idx is None or default_to_infrequent is None:
+                    orig_drop_idx = drop_idx
+                else:
+                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
+
+                drop_idx_.append(orig_drop_idx)
+
+            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
 
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
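A standalone sketch of the remapping performed above, with a hypothetical per-feature mapping (original category index to post-grouping index, infrequent categories sharing the last index):

```python
import numpy as np

# Hypothetical mapping for one feature: categories 0 and 3 are infrequent and
# both map to grouped index 2; categories 1 and 2 keep their relative order.
default_to_infrequent = np.array([2, 0, 1, 2])

drop_idx = 1  # drop index expressed in the grouped categories
orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
print(orig_drop_idx)  # 2: the original category that became grouped index 1
```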
@@ -809,16 +846,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):
 
     def _remove_dropped_categories(self, categories, i):
         """Remove dropped categories."""
-        if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
-            return np.delete(categories, self.drop_idx_[i])
+        if (
+            self._drop_idx_after_grouping is not None
+            and self._drop_idx_after_grouping[i] is not None
+        ):
+            return np.delete(categories, self._drop_idx_after_grouping[i])
         return categories
 
     def _compute_n_features_outs(self):
         """Compute the n_features_out for each input feature."""
         output = [len(cats) for cats in self.categories_]
 
-        if self.drop_idx_ is not None:
-            for i, drop_idx in enumerate(self.drop_idx_):
+        if self._drop_idx_after_grouping is not None:
+            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                 if drop_idx is not None:
                     output[i] -= 1
 
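A hedged illustration of the accounting above: each feature loses one output column per dropped post-grouping category, which shows up in the transformed width and the generated feature names (`sparse_output` assumes scikit-learn ≥ 1.2; the exact feature-name strings may differ by version):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["apple"] * 2 + ["banana"] * 10 + ["cherry"] * 10]).T
enc = OneHotEncoder(min_frequency=5, drop="first", sparse_output=False)
Xt = enc.fit_transform(X)
print(Xt.shape)                     # expected: (22, 2) -> 3 grouped categories minus 1 dropped
print(enc.get_feature_names_out())  # e.g. ['x0_cherry' 'x0_infrequent_sklearn']
```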
@@ -875,7 +915,7 @@ def fit(self, X, y=None):
         self._fit_infrequent_category_mapping(
             fit_results["n_samples"], fit_results["category_counts"]
         )
-        self.drop_idx_ = self._compute_drop_idx()
+        self._set_drop_idx()
         self._n_features_outs = self._compute_n_features_outs()
         return self
 
@@ -914,8 +954,8 @@ def transform(self, X):
 
         n_samples, n_features = X_int.shape
 
-        if self.drop_idx_ is not None:
-            to_drop = self.drop_idx_.copy()
+        if self._drop_idx_after_grouping is not None:
+            to_drop = self._drop_idx_after_grouping.copy()
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
             keep_cells = X_int != to_drop
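An illustration only (not the library code) of the masking idea in the context lines above: comparing the integer-encoded matrix against one drop index per feature flags the dropped cells via broadcasting:

```python
import numpy as np

X_int = np.array([[0, 2], [1, 0], [2, 1]])  # hypothetical integer-encoded data
to_drop = np.array([0, 1])                  # hypothetical per-feature drop index
keep_cells = X_int != to_drop               # False where a cell hits its feature's dropped category
print(keep_cells)
```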
@@ -1014,7 +1054,7 @@ def inverse_transform(self, X):
             # category. In this case we just fill the column with this
             # unique category value.
             if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
+                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                 j += n_categories
                 continue
             sub = X[:, j : j + n_categories]
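A hedged example of the `n_categories == 0` branch above: a feature whose only category is dropped contributes zero output columns, and `inverse_transform` reconstructs it from the drop index alone:

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["only"], ["only"], ["only"]])
enc = OneHotEncoder(drop="first")
Xt = enc.fit_transform(X)
print(Xt.shape)                   # expected: (3, 0)
print(enc.inverse_transform(Xt))  # expected: [['only'], ['only'], ['only']]
```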
@@ -1031,14 +1071,19 @@ def inverse_transform(self, X):
                 if unknown.any():
                     # if categories were dropped then unknown categories will
                     # be mapped to the dropped category
-                    if self.drop_idx_ is None or self.drop_idx_[i] is None:
+                    if (
+                        self._drop_idx_after_grouping is None
+                        or self._drop_idx_after_grouping[i] is None
+                    ):
                         found_unknown[i] = unknown
                     else:
-                        X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
+                        X_tr[unknown, i] = self.categories_[i][
+                            self._drop_idx_after_grouping[i]
+                        ]
             else:
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
-                    if self.drop_idx_ is None:
+                    if self._drop_idx_after_grouping is None:
                         all_zero_samples = np.flatnonzero(dropped)
                         raise ValueError(
                             f"Samples {all_zero_samples} can not be inverted "
@@ -1047,7 +1092,7 @@ def inverse_transform(self, X):
                         )
                     # we can safely assume that all of the nulls in each column
                     # are the dropped value
-                    drop_idx = self.drop_idx_[i]
+                    drop_idx = self._drop_idx_after_grouping[i]
                     X_tr[dropped, i] = transformed_features[i][drop_idx]
 
             j += n_categories
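Finally, a hedged round-trip sketch for the inverse path touched in this hunk: all-zero rows for a feature map back to its dropped category, while infrequent inputs come back as the infrequent placeholder rather than their original value:

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["apple"] * 2 + ["banana"] * 10 + ["cherry"] * 10]).T
enc = OneHotEncoder(min_frequency=5, drop="first")
Xt = enc.fit_transform(X)
# First rows: two 'apple' samples (infrequent) and one dropped 'banana' sample.
print(enc.inverse_transform(Xt[:3]))
# expected: [['infrequent_sklearn'], ['infrequent_sklearn'], ['banana']]
```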