diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index cbe013bf1152d..941ba79c409f4 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -84,6 +84,10 @@ Changelog
 :mod:`sklearn.preprocessing`
 ............................
 
+- |Fix| :attr:`preprocessing.OneHotEncoder.drop_idx_` now properly
+  references the dropped category in the `categories_` attribute
+  when there are infrequent categories. :pr:`25589` by `Thomas Fan`_.
+
 - |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports
   `encoded_missing_value` or `unknown_value` set to a categories' cardinality
   when there is missing values in the training data. :pr:`25704` by `Thomas Fan`_.
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index ec1bbeea62448..4c3e80771c35a 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
           should be dropped.
 
+        When `max_categories` or `min_frequency` is configured to group
+        infrequent categories, the dropping behavior is handled after the
+        grouping.
+
         .. versionadded:: 0.21
            The parameter `drop` was added in 0.21.
 
@@ -544,7 +548,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
         """Convert `drop_idx` into the index for infrequent categories.
 
         If there are no infrequent categories, then `drop_idx` is
-        returned. This method is called in `_compute_drop_idx` when the `drop`
+        returned. This method is called in `_set_drop_idx` when the `drop`
         parameter is an array-like.
         """
         if not self._infrequent_enabled:
@@ -564,24 +568,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
             )
         return default_to_infrequent[drop_idx]
 
-    def _compute_drop_idx(self):
+    def _set_drop_idx(self):
         """Compute the drop indices associated with `self.categories_`.
 
         If `self.drop` is:
-        - `None`, returns `None`.
-        - `'first'`, returns all zeros to drop the first category.
-        - `'if_binary'`, returns zero if the category is binary and `None`
+        - `None`, No categories have been dropped.
+        - `'first'`, All zeros to drop the first category.
+        - `'if_binary'`, All zeros if the category is binary and `None`
           otherwise.
-        - array-like, returns the indices of the categories that match the
+        - array-like, The indices of the categories that match the
           categories in `self.drop`. If the dropped category is an infrequent
           category, then the index for the infrequent category is used. This
           means that the entire infrequent category is dropped.
+
+        This method defines a public `drop_idx_` and a private
+        `_drop_idx_after_grouping`.
+
+        - `drop_idx_`: Public facing API that references the drop category in
+          `self.categories_`.
+        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
+          infrequent categories are grouped together.
+
+        If there are no infrequent categories or drop is `None`, then
+        `drop_idx_=_drop_idx_after_grouping`.
         """
         if self.drop is None:
-            return None
+            drop_idx_after_grouping = None
         elif isinstance(self.drop, str):
             if self.drop == "first":
-                return np.zeros(len(self.categories_), dtype=object)
+                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
             elif self.drop == "if_binary":
                 n_features_out_no_drop = [len(cat) for cat in self.categories_]
                 if self._infrequent_enabled:
@@ -590,7 +605,7 @@ def _compute_drop_idx(self):
                             continue
                         n_features_out_no_drop[i] -= infreq_idx.size - 1
 
-                return np.array(
+                drop_idx_after_grouping = np.array(
                     [
                         0 if n_features_out == 2 else None
                         for n_features_out in n_features_out_no_drop
@@ -647,7 +662,29 @@ def _compute_drop_idx(self):
                     )
                 )
                 raise ValueError(msg)
-            return np.array(drop_indices, dtype=object)
+            drop_idx_after_grouping = np.array(drop_indices, dtype=object)
+
+        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
+        # categories are grouped together. If needed, we remap `drop_idx` back
+        # to the categories seen in `self.categories_`.
+        self._drop_idx_after_grouping = drop_idx_after_grouping
+
+        if not self._infrequent_enabled or drop_idx_after_grouping is None:
+            self.drop_idx_ = self._drop_idx_after_grouping
+        else:
+            drop_idx_ = []
+            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
+                default_to_infrequent = self._default_to_infrequent_mappings[
+                    feature_idx
+                ]
+                if drop_idx is None or default_to_infrequent is None:
+                    orig_drop_idx = drop_idx
+                else:
+                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
+
+                drop_idx_.append(orig_drop_idx)
+
+            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
 
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
@@ -809,16 +846,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):
 
     def _remove_dropped_categories(self, categories, i):
         """Remove dropped categories."""
-        if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
-            return np.delete(categories, self.drop_idx_[i])
+        if (
+            self._drop_idx_after_grouping is not None
+            and self._drop_idx_after_grouping[i] is not None
+        ):
+            return np.delete(categories, self._drop_idx_after_grouping[i])
         return categories
 
     def _compute_n_features_outs(self):
         """Compute the n_features_out for each input feature."""
         output = [len(cats) for cats in self.categories_]
 
-        if self.drop_idx_ is not None:
-            for i, drop_idx in enumerate(self.drop_idx_):
+        if self._drop_idx_after_grouping is not None:
+            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                 if drop_idx is not None:
                     output[i] -= 1
 
@@ -875,7 +915,7 @@ def fit(self, X, y=None):
         self._fit_infrequent_category_mapping(
             fit_results["n_samples"], fit_results["category_counts"]
         )
-        self.drop_idx_ = self._compute_drop_idx()
+        self._set_drop_idx()
         self._n_features_outs = self._compute_n_features_outs()
         return self
 
@@ -914,8 +954,8 @@ def transform(self, X):
 
         n_samples, n_features = X_int.shape
 
-        if self.drop_idx_ is not None:
-            to_drop = self.drop_idx_.copy()
+        if self._drop_idx_after_grouping is not None:
+            to_drop = self._drop_idx_after_grouping.copy()
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
             keep_cells = X_int != to_drop
@@ -1014,7 +1054,7 @@ def inverse_transform(self, X):
             # category. In this case we just fill the column with this
             # unique category value.
             if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
+                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                 j += n_categories
                 continue
             sub = X[:, j : j + n_categories]
@@ -1031,14 +1071,19 @@ def inverse_transform(self, X):
                 if unknown.any():
                     # if categories were dropped then unknown categories will
                     # be mapped to the dropped category
-                    if self.drop_idx_ is None or self.drop_idx_[i] is None:
+                    if (
+                        self._drop_idx_after_grouping is None
+                        or self._drop_idx_after_grouping[i] is None
+                    ):
                         found_unknown[i] = unknown
                     else:
-                        X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
+                        X_tr[unknown, i] = self.categories_[i][
+                            self._drop_idx_after_grouping[i]
+                        ]
             else:
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
-                    if self.drop_idx_ is None:
+                    if self._drop_idx_after_grouping is None:
                         all_zero_samples = np.flatnonzero(dropped)
                         raise ValueError(
                             f"Samples {all_zero_samples} can not be inverted "
@@ -1047,7 +1092,7 @@ def inverse_transform(self, X):
                         )
                     # we can safely assume that all of the nulls in each column
                     # are the dropped value
-                    drop_idx = self.drop_idx_[i]
+                    drop_idx = self._drop_idx_after_grouping[i]
                     X_tr[dropped, i] = transformed_features[i][drop_idx]
 
             j += n_categories
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 9927e7e365865..a4fea0ee92dbc 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -929,7 +929,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop):
         max_categories=2,
         drop=drop,
     ).fit(X_train)
-    assert_array_equal(ohe.drop_idx_, [0])
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
 
     X_test = np.array([["b"], ["c"]])
     X_trans = ohe.transform(X_test)
@@ -2015,3 +2015,39 @@ def test_ordinal_encoder_missing_unknown_encoding_max():
     X_test = np.array([["snake"]])
     X_trans = enc.transform(X_test)
     assert_allclose(X_trans, [[2]])
+
+
+def test_drop_idx_infrequent_categories():
+    """Check drop_idx is defined correctly with infrequent categories.
+
+    Non-regression test for gh-25550.
+    """
+    X = np.array(
+        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
+    ).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
+    )
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
+
+    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
+    assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"
+
+    X = np.array(
+        [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
+    ).T
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
+    )
+    assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"
+
+    ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
+    assert_array_equal(
+        ohe.get_feature_names_out(),
+        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
+    )
+    assert ohe.drop_idx_ is None