Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ Changelog
:mod:`sklearn.preprocessing`
............................

- |Fix| :attr:`preprocessing.OneHotEncoder.drop_idx_` now properly
references the dropped category in the `categories_` attribute
when there are infrequent categories. :pr:`25589` by `Thomas Fan`_.

- |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports
`encoded_missing_value` or `unknown_value` set to a category's cardinality
when there are missing values in the training data. :pr:`25704` by `Thomas Fan`_.
Expand Down
89 changes: 67 additions & 22 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
should be dropped.

When `max_categories` or `min_frequency` is configured to group
infrequent categories, the dropping behavior is handled after the
grouping.

.. versionadded:: 0.21
The parameter `drop` was added in 0.21.

Expand Down Expand Up @@ -544,7 +548,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
"""Convert `drop_idx` into the index for infrequent categories.

If there are no infrequent categories, then `drop_idx` is
returned. This method is called in `_compute_drop_idx` when the `drop`
returned. This method is called in `_set_drop_idx` when the `drop`
parameter is an array-like.
"""
if not self._infrequent_enabled:
Expand All @@ -564,24 +568,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
)
return default_to_infrequent[drop_idx]

def _compute_drop_idx(self):
def _set_drop_idx(self):
"""Compute the drop indices associated with `self.categories_`.

If `self.drop` is:
- `None`, returns `None`.
- `'first'`, returns all zeros to drop the first category.
- `'if_binary'`, returns zero if the category is binary and `None`
- `None`, No categories have been dropped.
- `'first'`, All zeros to drop the first category.
- `'if_binary'`, All zeros if the category is binary and `None`
otherwise.
- array-like, returns the indices of the categories that match the
- array-like, The indices of the categories that match the
categories in `self.drop`. If the dropped category is an infrequent
category, then the index for the infrequent category is used. This
means that the entire infrequent category is dropped.

This method defines a public `drop_idx_` and a private
`_drop_idx_after_grouping`.

- `drop_idx_`: Public facing API that references the drop category in
`self.categories_`.
- `_drop_idx_after_grouping`: Used internally to drop categories *after* the
infrequent categories are grouped together.

If there are no infrequent categories or drop is `None`, then
`drop_idx_=_drop_idx_after_grouping`.
"""
if self.drop is None:
return None
drop_idx_after_grouping = None
elif isinstance(self.drop, str):
if self.drop == "first":
return np.zeros(len(self.categories_), dtype=object)
drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
elif self.drop == "if_binary":
n_features_out_no_drop = [len(cat) for cat in self.categories_]
if self._infrequent_enabled:
Expand All @@ -590,7 +605,7 @@ def _compute_drop_idx(self):
continue
n_features_out_no_drop[i] -= infreq_idx.size - 1

return np.array(
drop_idx_after_grouping = np.array(
[
0 if n_features_out == 2 else None
for n_features_out in n_features_out_no_drop
Expand Down Expand Up @@ -647,7 +662,29 @@ def _compute_drop_idx(self):
)
)
raise ValueError(msg)
return np.array(drop_indices, dtype=object)
drop_idx_after_grouping = np.array(drop_indices, dtype=object)

# `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
# categories are grouped together. If needed, we remap `drop_idx` back
# to the categories seen in `self.categories_`.
self._drop_idx_after_grouping = drop_idx_after_grouping

if not self._infrequent_enabled or drop_idx_after_grouping is None:
self.drop_idx_ = self._drop_idx_after_grouping
else:
drop_idx_ = []
for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
default_to_infrequent = self._default_to_infrequent_mappings[
feature_idx
]
if drop_idx is None or default_to_infrequent is None:
orig_drop_idx = drop_idx
else:
orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]

drop_idx_.append(orig_drop_idx)

self.drop_idx_ = np.asarray(drop_idx_, dtype=object)

def _identify_infrequent(self, category_count, n_samples, col_idx):
"""Compute the infrequent indices.
Expand Down Expand Up @@ -809,16 +846,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):

def _remove_dropped_categories(self, categories, i):
"""Remove dropped categories."""
if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
return np.delete(categories, self.drop_idx_[i])
if (
self._drop_idx_after_grouping is not None
and self._drop_idx_after_grouping[i] is not None
):
return np.delete(categories, self._drop_idx_after_grouping[i])
return categories

def _compute_n_features_outs(self):
"""Compute the n_features_out for each input feature."""
output = [len(cats) for cats in self.categories_]

if self.drop_idx_ is not None:
for i, drop_idx in enumerate(self.drop_idx_):
if self._drop_idx_after_grouping is not None:
for i, drop_idx in enumerate(self._drop_idx_after_grouping):
if drop_idx is not None:
output[i] -= 1

Expand Down Expand Up @@ -875,7 +915,7 @@ def fit(self, X, y=None):
self._fit_infrequent_category_mapping(
fit_results["n_samples"], fit_results["category_counts"]
)
self.drop_idx_ = self._compute_drop_idx()
self._set_drop_idx()
self._n_features_outs = self._compute_n_features_outs()
return self

Expand Down Expand Up @@ -914,8 +954,8 @@ def transform(self, X):

n_samples, n_features = X_int.shape

if self.drop_idx_ is not None:
to_drop = self.drop_idx_.copy()
if self._drop_idx_after_grouping is not None:
to_drop = self._drop_idx_after_grouping.copy()
# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.
keep_cells = X_int != to_drop
Expand Down Expand Up @@ -1014,7 +1054,7 @@ def inverse_transform(self, X):
# category. In this case we just fill the column with this
# unique category value.
if n_categories == 0:
X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
j += n_categories
continue
sub = X[:, j : j + n_categories]
Expand All @@ -1031,14 +1071,19 @@ def inverse_transform(self, X):
if unknown.any():
# if categories were dropped then unknown categories will
# be mapped to the dropped category
if self.drop_idx_ is None or self.drop_idx_[i] is None:
if (
self._drop_idx_after_grouping is None
or self._drop_idx_after_grouping[i] is None
):
found_unknown[i] = unknown
else:
X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
X_tr[unknown, i] = self.categories_[i][
self._drop_idx_after_grouping[i]
]
else:
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
if dropped.any():
if self.drop_idx_ is None:
if self._drop_idx_after_grouping is None:
all_zero_samples = np.flatnonzero(dropped)
raise ValueError(
f"Samples {all_zero_samples} can not be inverted "
Expand All @@ -1047,7 +1092,7 @@ def inverse_transform(self, X):
)
# we can safely assume that all of the nulls in each column
# are the dropped value
drop_idx = self.drop_idx_[i]
drop_idx = self._drop_idx_after_grouping[i]
X_tr[dropped, i] = transformed_features[i][drop_idx]

j += n_categories
Expand Down
38 changes: 37 additions & 1 deletion sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop):
max_categories=2,
drop=drop,
).fit(X_train)
assert_array_equal(ohe.drop_idx_, [0])
assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"

X_test = np.array([["b"], ["c"]])
X_trans = ohe.transform(X_test)
Expand Down Expand Up @@ -2015,3 +2015,39 @@ def test_ordinal_encoder_missing_unknown_encoding_max():
X_test = np.array([["snake"]])
X_trans = enc.transform(X_test)
assert_allclose(X_trans, [[2]])


def test_drop_idx_infrequent_categories():
    """Check drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    """
    # 'a' appears only twice, so min_frequency=4 groups it as infrequent.
    # The resulting output column order is c, d, e, infrequent; drop="first"
    # therefore drops 'b', and drop_idx_ must reference 'b' in categories_.
    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
    assert_array_equal(
        ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
    )
    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"

    # After grouping 'a' and 'b' as infrequent, only two encoded categories
    # remain ('c' and infrequent), so drop="if_binary" applies: the single
    # remaining output is the infrequent column and drop_idx_ points at 'c'.
    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
    assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
    assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"

    # Dropping an explicit frequent category by name: drop_idx_ must still
    # index 'd' inside categories_ even though infrequent grouping changed
    # the encoded column layout.
    X = np.array(
        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
    ).T
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
    assert_array_equal(
        ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
    )
    assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"

    # With drop=None no category is dropped and drop_idx_ stays None.
    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
    assert_array_equal(
        ohe.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert ohe.drop_idx_ is None