Commit b000feb

thomasjpfan committed
FIX Fixes bug in OneHotEncoder's drop_idx_ when there are infrequent categories (scikit-learn#25589)
Co-authored-by: Guillaume Lemaitre <[email protected]>
Co-authored-by: Jérémie du Boisberranger <[email protected]>
Co-authored-by: Julien Jerphanion <[email protected]>
1 parent 75c1bdf commit b000feb

File tree

3 files changed: +108 -23 lines changed

3 files changed

+108
-23
lines changed
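For context: before this commit, `drop_idx_` was an index into the category list *after* infrequent grouping, so indexing `categories_` with it could point at the wrong category. A minimal sketch of the fixed behavior, mirroring the non-regression test added below (comments show the expected output with this fix applied):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # "a" appears only twice, so min_frequency=4 folds it into the
    # "infrequent_sklearn" bucket; "b" is the first frequent category.
    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    # drop_idx_ now references categories_ directly, so the dropped
    # category can be looked up without knowing the grouped ordering.
    print(ohe.categories_[0][ohe.drop_idx_[0]])  # "b"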

doc/whats_new/v1.2.rst

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,10 @@ Changelog
 :mod:`sklearn.preprocessing`
 ............................
 
+- |Fix| :attr:`preprocessing.OneHotEncoder.drop_idx_` now properly
+  references the dropped category in the `categories_` attribute
+  when there are infrequent categories. :pr:`25589` by `Thomas Fan`_.
+
 - |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports
   `encoded_missing_value` or `unknown_value` set to a categories' cardinality
   when there is missing values in the training data. :pr:`25704` by `Thomas Fan`_.

sklearn/preprocessing/_encoders.py

Lines changed: 67 additions & 22 deletions
@@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
           should be dropped.
 
+        When `max_categories` or `min_frequency` is configured to group
+        infrequent categories, the dropping behavior is handled after the
+        grouping.
+
         .. versionadded:: 0.21
             The parameter `drop` was added in 0.21.
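To illustrate the docstring note added above: with `min_frequency` set, grouping happens first, and only then is the drop applied to the grouped categories. A small sketch based on the `if_binary` case from the new test:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # "a" and "b" are infrequent, so grouping leaves two effective
    # categories, "c" and "infrequent_sklearn"; drop="if_binary" then
    # drops one of those two, not one of the three raw categories.
    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)

    print(ohe.get_feature_names_out())           # ["x0_infrequent_sklearn"]
    print(ohe.categories_[0][ohe.drop_idx_[0]])  # "c"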
@@ -514,7 +518,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
         """Convert `drop_idx` into the index for infrequent categories.
 
         If there are no infrequent categories, then `drop_idx` is
-        returned. This method is called in `_compute_drop_idx` when the `drop`
+        returned. This method is called in `_set_drop_idx` when the `drop`
         parameter is an array-like.
         """
         if not self._infrequent_enabled:
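`_default_to_infrequent_mappings` is a private detail; as a hedged sketch of the lookup this method performs, assume position `k` of the mapping holds category `k`'s index after infrequent grouping (the concrete values here are illustrative, not taken from the library):

    import numpy as np

    # Illustrative mapping for categories_ = ["a", "b", "c", "d", "e"]
    # with "a" infrequent: grouped order is ["b", "c", "d", "e", infrequent].
    default_to_infrequent = np.array([4, 0, 1, 2, 3])

    drop_idx = 1                            # "b" in the original ordering
    print(default_to_infrequent[drop_idx])  # 0: "b" comes first after grouping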
@@ -534,24 +538,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
             )
         return default_to_infrequent[drop_idx]
 
-    def _compute_drop_idx(self):
+    def _set_drop_idx(self):
         """Compute the drop indices associated with `self.categories_`.
 
         If `self.drop` is:
-        - `None`, returns `None`.
-        - `'first'`, returns all zeros to drop the first category.
-        - `'if_binary'`, returns zero if the category is binary and `None`
+        - `None`, No categories have been dropped.
+        - `'first'`, All zeros to drop the first category.
+        - `'if_binary'`, All zeros if the category is binary and `None`
           otherwise.
-        - array-like, returns the indices of the categories that match the
+        - array-like, The indices of the categories that match the
           categories in `self.drop`. If the dropped category is an infrequent
           category, then the index for the infrequent category is used. This
           means that the entire infrequent category is dropped.
+
+        This method defines a public `drop_idx_` and a private
+        `_drop_idx_after_grouping`.
+
+        - `drop_idx_`: Public facing API that references the drop category in
+          `self.categories_`.
+        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
+          infrequent categories are grouped together.
+
+        If there are no infrequent categories or drop is `None`, then
+        `drop_idx_=_drop_idx_after_grouping`.
         """
         if self.drop is None:
-            return None
+            drop_idx_after_grouping = None
         elif isinstance(self.drop, str):
             if self.drop == "first":
-                return np.zeros(len(self.categories_), dtype=object)
+                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
             elif self.drop == "if_binary":
                 n_features_out_no_drop = [len(cat) for cat in self.categories_]
                 if self._infrequent_enabled:
@@ -560,7 +575,7 @@ def _compute_drop_idx(self):
                         continue
                     n_features_out_no_drop[i] -= infreq_idx.size - 1
 
-                return np.array(
+                drop_idx_after_grouping = np.array(
                     [
                         0 if n_features_out == 2 else None
                         for n_features_out in n_features_out_no_drop
@@ -617,7 +632,29 @@ def _compute_drop_idx(self):
                     )
                 )
                 raise ValueError(msg)
-            return np.array(drop_indices, dtype=object)
+            drop_idx_after_grouping = np.array(drop_indices, dtype=object)
+
+        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
+        # categories are grouped together. If needed, we remap `drop_idx` back
+        # to the categories seen in `self.categories_`.
+        self._drop_idx_after_grouping = drop_idx_after_grouping
+
+        if not self._infrequent_enabled or drop_idx_after_grouping is None:
+            self.drop_idx_ = self._drop_idx_after_grouping
+        else:
+            drop_idx_ = []
+            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
+                default_to_infrequent = self._default_to_infrequent_mappings[
+                    feature_idx
+                ]
+                if drop_idx is None or default_to_infrequent is None:
+                    orig_drop_idx = drop_idx
+                else:
+                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
+
+                drop_idx_.append(orig_drop_idx)
+
+            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
 
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
@@ -779,16 +816,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):
 
     def _remove_dropped_categories(self, categories, i):
         """Remove dropped categories."""
-        if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
-            return np.delete(categories, self.drop_idx_[i])
+        if (
+            self._drop_idx_after_grouping is not None
+            and self._drop_idx_after_grouping[i] is not None
+        ):
+            return np.delete(categories, self._drop_idx_after_grouping[i])
         return categories
 
     def _compute_n_features_outs(self):
         """Compute the n_features_out for each input feature."""
         output = [len(cats) for cats in self.categories_]
 
-        if self.drop_idx_ is not None:
-            for i, drop_idx in enumerate(self.drop_idx_):
+        if self._drop_idx_after_grouping is not None:
+            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                 if drop_idx is not None:
                     output[i] -= 1
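A quick sanity check of the bookkeeping in `_compute_n_features_outs`, reusing the toy data from the first sketch: the lone infrequent category is replaced by the infrequent bucket (still five grouped categories), and the drop removes one column:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    # 5 grouped categories ("b", "c", "d", "e", "infrequent_sklearn")
    # minus the 1 dropped category -> 4 output features.
    print(len(ohe.get_feature_names_out()))  # 4
    print(ohe.transform(X[:1]).shape)        # (1, 4)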

@@ -845,7 +885,7 @@ def fit(self, X, y=None):
         self._fit_infrequent_category_mapping(
             fit_results["n_samples"], fit_results["category_counts"]
         )
-        self.drop_idx_ = self._compute_drop_idx()
+        self._set_drop_idx()
         self._n_features_outs = self._compute_n_features_outs()
         return self
@@ -884,8 +924,8 @@ def transform(self, X):
 
         n_samples, n_features = X_int.shape
 
-        if self.drop_idx_ is not None:
-            to_drop = self.drop_idx_.copy()
+        if self._drop_idx_after_grouping is not None:
+            to_drop = self._drop_idx_after_grouping.copy()
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
             keep_cells = X_int != to_drop
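Since `to_drop` now holds indices in the grouped space, the dropped category encodes to an all-zero row while infrequent raw values land in the bucket column; same toy data as above:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    # Columns are ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"].
    print(ohe.transform(np.array([["b"]], dtype=object)))  # [[0. 0. 0. 0.]] dropped
    print(ohe.transform(np.array([["a"]], dtype=object)))  # [[0. 0. 0. 1.]] infrequent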
@@ -984,7 +1024,7 @@ def inverse_transform(self, X):
             # category. In this case we just fill the column with this
             # unique category value.
             if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
+                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                 j += n_categories
                 continue
             sub = X[:, j : j + n_categories]
@@ -1001,14 +1041,19 @@ def inverse_transform(self, X):
                 if unknown.any():
                     # if categories were dropped then unknown categories will
                     # be mapped to the dropped category
-                    if self.drop_idx_ is None or self.drop_idx_[i] is None:
+                    if (
+                        self._drop_idx_after_grouping is None
+                        or self._drop_idx_after_grouping[i] is None
+                    ):
                         found_unknown[i] = unknown
                     else:
-                        X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
+                        X_tr[unknown, i] = self.categories_[i][
+                            self._drop_idx_after_grouping[i]
+                        ]
                 else:
                     dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                     if dropped.any():
-                        if self.drop_idx_ is None:
+                        if self._drop_idx_after_grouping is None:
                             all_zero_samples = np.flatnonzero(dropped)
                             raise ValueError(
                                 f"Samples {all_zero_samples} can not be inverted "
@@ -1017,7 +1062,7 @@ def inverse_transform(self, X):
                             )
                         # we can safely assume that all of the nulls in each column
                         # are the dropped value
-                        drop_idx = self.drop_idx_[i]
+                        drop_idx = self._drop_idx_after_grouping[i]
                         X_tr[dropped, i] = transformed_features[i][drop_idx]
 
             j += n_categories
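Conversely, `inverse_transform` maps an all-zero row back to the dropped category and a bucket hit back to the literal infrequent label; a round-trip sketch on the same data (expected output hedged, not taken from the test suite):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)

    Xt = ohe.transform(np.array([["b"], ["a"]], dtype=object))
    print(ohe.inverse_transform(Xt))  # [["b"] ["infrequent_sklearn"]]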

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 37 additions & 1 deletion
@@ -903,7 +903,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop):
         max_categories=2,
         drop=drop,
     ).fit(X_train)
-    assert_array_equal(ohe.drop_idx_, [0])
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
 
     X_test = np.array([["b"], ["c"]])
     X_trans = ohe.transform(X_test)
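The assertion changes because `drop_idx_` now indexes `self.categories_` rather than the grouped category list, so the test looks the dropped category up by value instead of pinning a fixed grouped index.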
@@ -1989,3 +1989,39 @@ def test_ordinal_encoder_missing_unknown_encoding_max():
     X_test = np.array([["snake"]])
     X_trans = enc.transform(X_test)
     assert_allclose(X_trans, [[2]])
+
+
+def test_drop_idx_infrequent_categories():
+    """Check drop_idx is defined correctly with infrequent categories.
+
+    Non-regression test for gh-25550.
+    """
+    X = np.array(
+        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
+    ).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
+    )
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
+
+    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
+    assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"
+
+    X = np.array(
+        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
+    ).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
+    )
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"
+
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(),
+        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
+    )
+    assert ohe.drop_idx_ is None
