Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1380e3c

Browse files
thomasjpfan, glemaitre, jeremiedbb, and jjerphan
authored
FIX Fixes bug in OneHotEncoder's drop_idx_ when there are infrequent categories (#25589)
Co-authored-by: Guillaume Lemaitre <[email protected]> Co-authored-by: Jérémie du Boisberranger <[email protected]> Co-authored-by: Julien Jerphanion <[email protected]>
1 parent 451f212 commit 1380e3c

File tree

3 files changed

+108
-23
lines changed

3 files changed

+108
-23
lines changed

doc/whats_new/v1.2.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ Changelog
8484
:mod:`sklearn.preprocessing`
8585
............................
8686

87+
- |Fix| :attr:`preprocessing.OneHotEncoder.drop_idx_` now properly
88+
references the dropped category in the `categories_` attribute
89+
when there are infrequent categories. :pr:`25589` by `Thomas Fan`_.
90+
8791
- |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports
8892
`encoded_missing_value` or `unknown_value` set to a categories' cardinality
8993
when there is missing values in the training data. :pr:`25704` by `Thomas Fan`_.

sklearn/preprocessing/_encoders.py

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
270270
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
271271
should be dropped.
272272
273+
When `max_categories` or `min_frequency` is configured to group
274+
infrequent categories, the dropping behavior is handled after the
275+
grouping.
276+
273277
.. versionadded:: 0.21
274278
The parameter `drop` was added in 0.21.
275279
@@ -544,7 +548,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
544548
"""Convert `drop_idx` into the index for infrequent categories.
545549
546550
If there are no infrequent categories, then `drop_idx` is
547-
returned. This method is called in `_compute_drop_idx` when the `drop`
551+
returned. This method is called in `_set_drop_idx` when the `drop`
548552
parameter is an array-like.
549553
"""
550554
if not self._infrequent_enabled:
@@ -564,24 +568,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
564568
)
565569
return default_to_infrequent[drop_idx]
566570

567-
def _compute_drop_idx(self):
571+
def _set_drop_idx(self):
568572
"""Compute the drop indices associated with `self.categories_`.
569573
570574
If `self.drop` is:
571-
- `None`, returns `None`.
572-
- `'first'`, returns all zeros to drop the first category.
573-
- `'if_binary'`, returns zero if the category is binary and `None`
575+
- `None`, No categories have been dropped.
576+
- `'first'`, All zeros to drop the first category.
577+
- `'if_binary'`, All zeros if the category is binary and `None`
574578
otherwise.
575-
- array-like, returns the indices of the categories that match the
579+
- array-like, The indices of the categories that match the
576580
categories in `self.drop`. If the dropped category is an infrequent
577581
category, then the index for the infrequent category is used. This
578582
means that the entire infrequent category is dropped.
583+
584+
This methods defines a public `drop_idx_` and a private
585+
`_drop_idx_after_grouping`.
586+
587+
- `drop_idx_`: Public facing API that references the drop category in
588+
`self.categories_`.
589+
- `_drop_idx_after_grouping`: Used internally to drop categories *after* the
590+
infrequent categories are grouped together.
591+
592+
If there are no infrequent categories or drop is `None`, then
593+
`drop_idx_=_drop_idx_after_grouping`.
579594
"""
580595
if self.drop is None:
581-
return None
596+
drop_idx_after_grouping = None
582597
elif isinstance(self.drop, str):
583598
if self.drop == "first":
584-
return np.zeros(len(self.categories_), dtype=object)
599+
drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
585600
elif self.drop == "if_binary":
586601
n_features_out_no_drop = [len(cat) for cat in self.categories_]
587602
if self._infrequent_enabled:
@@ -590,7 +605,7 @@ def _compute_drop_idx(self):
590605
continue
591606
n_features_out_no_drop[i] -= infreq_idx.size - 1
592607

593-
return np.array(
608+
drop_idx_after_grouping = np.array(
594609
[
595610
0 if n_features_out == 2 else None
596611
for n_features_out in n_features_out_no_drop
@@ -647,7 +662,29 @@ def _compute_drop_idx(self):
647662
)
648663
)
649664
raise ValueError(msg)
650-
return np.array(drop_indices, dtype=object)
665+
drop_idx_after_grouping = np.array(drop_indices, dtype=object)
666+
667+
# `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
668+
# categories are grouped together. If needed, we remap `drop_idx` back
669+
# to the categories seen in `self.categories_`.
670+
self._drop_idx_after_grouping = drop_idx_after_grouping
671+
672+
if not self._infrequent_enabled or drop_idx_after_grouping is None:
673+
self.drop_idx_ = self._drop_idx_after_grouping
674+
else:
675+
drop_idx_ = []
676+
for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
677+
default_to_infrequent = self._default_to_infrequent_mappings[
678+
feature_idx
679+
]
680+
if drop_idx is None or default_to_infrequent is None:
681+
orig_drop_idx = drop_idx
682+
else:
683+
orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
684+
685+
drop_idx_.append(orig_drop_idx)
686+
687+
self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
651688

652689
def _identify_infrequent(self, category_count, n_samples, col_idx):
653690
"""Compute the infrequent indices.
@@ -809,16 +846,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):
809846

810847
def _remove_dropped_categories(self, categories, i):
811848
"""Remove dropped categories."""
812-
if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
813-
return np.delete(categories, self.drop_idx_[i])
849+
if (
850+
self._drop_idx_after_grouping is not None
851+
and self._drop_idx_after_grouping[i] is not None
852+
):
853+
return np.delete(categories, self._drop_idx_after_grouping[i])
814854
return categories
815855

816856
def _compute_n_features_outs(self):
817857
"""Compute the n_features_out for each input feature."""
818858
output = [len(cats) for cats in self.categories_]
819859

820-
if self.drop_idx_ is not None:
821-
for i, drop_idx in enumerate(self.drop_idx_):
860+
if self._drop_idx_after_grouping is not None:
861+
for i, drop_idx in enumerate(self._drop_idx_after_grouping):
822862
if drop_idx is not None:
823863
output[i] -= 1
824864

@@ -875,7 +915,7 @@ def fit(self, X, y=None):
875915
self._fit_infrequent_category_mapping(
876916
fit_results["n_samples"], fit_results["category_counts"]
877917
)
878-
self.drop_idx_ = self._compute_drop_idx()
918+
self._set_drop_idx()
879919
self._n_features_outs = self._compute_n_features_outs()
880920
return self
881921

@@ -914,8 +954,8 @@ def transform(self, X):
914954

915955
n_samples, n_features = X_int.shape
916956

917-
if self.drop_idx_ is not None:
918-
to_drop = self.drop_idx_.copy()
957+
if self._drop_idx_after_grouping is not None:
958+
to_drop = self._drop_idx_after_grouping.copy()
919959
# We remove all the dropped categories from mask, and decrement all
920960
# categories that occur after them to avoid an empty column.
921961
keep_cells = X_int != to_drop
@@ -1014,7 +1054,7 @@ def inverse_transform(self, X):
10141054
# category. In this case we just fill the column with this
10151055
# unique category value.
10161056
if n_categories == 0:
1017-
X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
1057+
X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
10181058
j += n_categories
10191059
continue
10201060
sub = X[:, j : j + n_categories]
@@ -1031,14 +1071,19 @@ def inverse_transform(self, X):
10311071
if unknown.any():
10321072
# if categories were dropped then unknown categories will
10331073
# be mapped to the dropped category
1034-
if self.drop_idx_ is None or self.drop_idx_[i] is None:
1074+
if (
1075+
self._drop_idx_after_grouping is None
1076+
or self._drop_idx_after_grouping[i] is None
1077+
):
10351078
found_unknown[i] = unknown
10361079
else:
1037-
X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
1080+
X_tr[unknown, i] = self.categories_[i][
1081+
self._drop_idx_after_grouping[i]
1082+
]
10381083
else:
10391084
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
10401085
if dropped.any():
1041-
if self.drop_idx_ is None:
1086+
if self._drop_idx_after_grouping is None:
10421087
all_zero_samples = np.flatnonzero(dropped)
10431088
raise ValueError(
10441089
f"Samples {all_zero_samples} can not be inverted "
@@ -1047,7 +1092,7 @@ def inverse_transform(self, X):
10471092
)
10481093
# we can safely assume that all of the nulls in each column
10491094
# are the dropped value
1050-
drop_idx = self.drop_idx_[i]
1095+
drop_idx = self._drop_idx_after_grouping[i]
10511096
X_tr[dropped, i] = transformed_features[i][drop_idx]
10521097

10531098
j += n_categories

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -929,7 +929,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop):
929929
max_categories=2,
930930
drop=drop,
931931
).fit(X_train)
932-
assert_array_equal(ohe.drop_idx_, [0])
932+
assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
933933

934934
X_test = np.array([["b"], ["c"]])
935935
X_trans = ohe.transform(X_test)
@@ -2015,3 +2015,39 @@ def test_ordinal_encoder_missing_unknown_encoding_max():
20152015
X_test = np.array([["snake"]])
20162016
X_trans = enc.transform(X_test)
20172017
assert_allclose(X_trans, [[2]])
2018+
2019+
2020+
def test_drop_idx_infrequent_categories():
2021+
"""Check drop_idx is defined correctly with infrequent categories.
2022+
2023+
Non-regression test for gh-25550.
2024+
"""
2025+
X = np.array(
2026+
[["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
2027+
).T
2028+
ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
2029+
assert_array_equal(
2030+
ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
2031+
)
2032+
assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
2033+
2034+
X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
2035+
ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
2036+
assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
2037+
assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"
2038+
2039+
X = np.array(
2040+
[["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
2041+
).T
2042+
ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
2043+
assert_array_equal(
2044+
ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
2045+
)
2046+
assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"
2047+
2048+
ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
2049+
assert_array_equal(
2050+
ohe.get_feature_names_out(),
2051+
["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
2052+
)
2053+
assert ohe.drop_idx_ is None

0 commit comments

Comments
 (0)