@@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
           should be dropped.
 
+        When `max_categories` or `min_frequency` is configured to group
+        infrequent categories, the dropping behavior is handled after the
+        grouping.
+
         .. versionadded:: 0.21
            The parameter `drop` was added in 0.21.
 
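A hedged usage sketch of the documented interaction (assuming a scikit-learn release that supports combining `drop` with `min_frequency`, roughly 1.1 or later): the rare categories are pooled into one infrequent column first, and only then is one category dropped per feature.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# "snake" and "lizard" fall below min_frequency and are grouped together,
# leaving three post-grouping categories; drop="first" then removes one.
X = np.array([["cat"] * 20 + ["dog"] * 10 + ["snake"] * 3 + ["lizard"] * 2]).T
enc = OneHotEncoder(drop="first", min_frequency=5).fit(X)
print(enc.transform(X).shape)  # expected: (35, 2)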
@@ -514,7 +518,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
514518 """Convert `drop_idx` into the index for infrequent categories.
515519
516520 If there are no infrequent categories, then `drop_idx` is
517- returned. This method is called in `_compute_drop_idx ` when the `drop`
521+ returned. This method is called in `_set_drop_idx ` when the `drop`
518522 parameter is an array-like.
519523 """
520524 if not self ._infrequent_enabled :
@@ -534,24 +538,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
             )
         return default_to_infrequent[drop_idx]
 
-    def _compute_drop_idx(self):
+    def _set_drop_idx(self):
         """Compute the drop indices associated with `self.categories_`.
 
         If `self.drop` is:
-        - `None`, returns `None`.
-        - `'first'`, returns all zeros to drop the first category.
-        - `'if_binary'`, returns zero if the category is binary and `None`
+        - `None`, no categories have been dropped.
+        - `'first'`, all zeros to drop the first category.
+        - `'if_binary'`, all zeros if the category is binary and `None`
           otherwise.
-        - array-like, returns the indices of the categories that match the
+        - array-like, the indices of the categories that match the
           categories in `self.drop`. If the dropped category is an infrequent
           category, then the index for the infrequent category is used. This
           means that the entire infrequent category is dropped.
+
+        This method defines a public `drop_idx_` and a private
+        `_drop_idx_after_grouping`.
+
+        - `drop_idx_`: Public facing API that references the drop category in
+          `self.categories_`.
+        - `_drop_idx_after_grouping`: Used internally to drop categories *after*
+          the infrequent categories are grouped together.
+
+        If there are no infrequent categories or drop is `None`, then
+        `drop_idx_=_drop_idx_after_grouping`.
         """
         if self.drop is None:
-            return None
+            drop_idx_after_grouping = None
         elif isinstance(self.drop, str):
             if self.drop == "first":
-                return np.zeros(len(self.categories_), dtype=object)
+                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
             elif self.drop == "if_binary":
                 n_features_out_no_drop = [len(cat) for cat in self.categories_]
                 if self._infrequent_enabled:
@@ -560,7 +575,7 @@ def _compute_drop_idx(self):
                             continue
                         n_features_out_no_drop[i] -= infreq_idx.size - 1
 
-                return np.array(
+                drop_idx_after_grouping = np.array(
                     [
                         0 if n_features_out == 2 else None
                         for n_features_out in n_features_out_no_drop
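To illustrate the `if_binary` accounting in the two hunks above, here is a hedged sketch (the printed shape is an expectation based on the described behavior, not output taken from this patch): a feature with three raw categories becomes binary once its rare categories are grouped, so its first post-grouping category is dropped.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Raw categories: "a" (10), "b" (2), "c" (2). With min_frequency=5, "b" and
# "c" collapse into one infrequent column, leaving two effective categories,
# so drop="if_binary" drops one of them.
X = np.array([["a"] * 10 + ["b"] * 2 + ["c"] * 2]).T
enc = OneHotEncoder(drop="if_binary", min_frequency=5).fit(X)
print(enc.transform(X).shape)  # expected: (14, 1)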
@@ -617,7 +632,29 @@ def _compute_drop_idx(self):
                     )
                 )
                 raise ValueError(msg)
-            return np.array(drop_indices, dtype=object)
+            drop_idx_after_grouping = np.array(drop_indices, dtype=object)
+
+        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
+        # categories are grouped together. If needed, we remap `drop_idx` back
+        # to the categories seen in `self.categories_`.
+        self._drop_idx_after_grouping = drop_idx_after_grouping
+
+        if not self._infrequent_enabled or drop_idx_after_grouping is None:
+            self.drop_idx_ = self._drop_idx_after_grouping
+        else:
+            drop_idx_ = []
+            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
+                default_to_infrequent = self._default_to_infrequent_mappings[
+                    feature_idx
+                ]
+                if drop_idx is None or default_to_infrequent is None:
+                    orig_drop_idx = drop_idx
+                else:
+                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
+
+                drop_idx_.append(orig_drop_idx)
+
+            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
 
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
@@ -779,16 +816,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):
 
     def _remove_dropped_categories(self, categories, i):
         """Remove dropped categories."""
-        if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
-            return np.delete(categories, self.drop_idx_[i])
+        if (
+            self._drop_idx_after_grouping is not None
+            and self._drop_idx_after_grouping[i] is not None
+        ):
+            return np.delete(categories, self._drop_idx_after_grouping[i])
         return categories
 
     def _compute_n_features_outs(self):
         """Compute the n_features_out for each input feature."""
         output = [len(cats) for cats in self.categories_]
 
-        if self.drop_idx_ is not None:
-            for i, drop_idx in enumerate(self.drop_idx_):
+        if self._drop_idx_after_grouping is not None:
+            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                 if drop_idx is not None:
                     output[i] -= 1
 
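As a hedged usage note for the two helpers above (the exact feature-name strings are an assumption about the infrequent naming scheme): dropped categories disappear from the output feature names, and the grouped categories surface as a single infrequent column.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"] * 10 + ["b"] * 8 + ["c"] * 2]).T
enc = OneHotEncoder(drop="first", min_frequency=5).fit(X)
# "a" is dropped, "c" is grouped; two output columns remain.
print(enc.get_feature_names_out())  # e.g. ['x0_b' 'x0_infrequent_sklearn']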
@@ -845,7 +885,7 @@ def fit(self, X, y=None):
             self._fit_infrequent_category_mapping(
                 fit_results["n_samples"], fit_results["category_counts"]
             )
-        self.drop_idx_ = self._compute_drop_idx()
+        self._set_drop_idx()
         self._n_features_outs = self._compute_n_features_outs()
         return self
 
@@ -884,8 +924,8 @@ def transform(self, X):
 
         n_samples, n_features = X_int.shape
 
-        if self.drop_idx_ is not None:
-            to_drop = self.drop_idx_.copy()
+        if self._drop_idx_after_grouping is not None:
+            to_drop = self._drop_idx_after_grouping.copy()
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
             keep_cells = X_int != to_drop
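A minimal standalone sketch of the mask-and-decrement step above, using made-up inputs rather than the encoder's internal state:

import numpy as np

X_int = np.array([[0, 2], [1, 0], [2, 1]])  # integer-encoded categories
to_drop = np.array([[1, 0]])                # one drop index per feature
keep_cells = X_int != to_drop               # mask cells that hit a dropped category
X_int[X_int > to_drop] -= 1                 # shift later categories down one slot
print(keep_cells)
print(X_int)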
@@ -984,7 +1024,7 @@ def inverse_transform(self, X):
             # category. In this case we just fill the column with this
             # unique category value.
             if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
+                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                 j += n_categories
                 continue
             sub = X[:, j : j + n_categories]
@@ -1001,14 +1041,19 @@ def inverse_transform(self, X):
                 if unknown.any():
                     # if categories were dropped then unknown categories will
                     # be mapped to the dropped category
-                    if self.drop_idx_ is None or self.drop_idx_[i] is None:
+                    if (
+                        self._drop_idx_after_grouping is None
+                        or self._drop_idx_after_grouping[i] is None
+                    ):
                         found_unknown[i] = unknown
                     else:
-                        X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
+                        X_tr[unknown, i] = self.categories_[i][
+                            self._drop_idx_after_grouping[i]
+                        ]
             else:
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
-                    if self.drop_idx_ is None:
+                    if self._drop_idx_after_grouping is None:
                         all_zero_samples = np.flatnonzero(dropped)
                         raise ValueError(
                             f"Samples {all_zero_samples} can not be inverted "
@@ -1017,7 +1062,7 @@ def inverse_transform(self, X):
                         )
                     # we can safely assume that all of the nulls in each column
                     # are the dropped value
-                    drop_idx = self.drop_idx_[i]
+                    drop_idx = self._drop_idx_after_grouping[i]
                     X_tr[dropped, i] = transformed_features[i][drop_idx]
 
             j += n_categories
             continue