diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c1d3b1e80c352..1036e7b67d4a9 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -2,6 +2,8 @@
 # Joris Van den Bossche
 # License: BSD 3 clause
 
+from numbers import Integral
+
 import numpy as np
 from scipy import sparse
 
@@ -10,7 +12,7 @@
 from ..utils.fixes import _argmax
 from ..utils.validation import check_is_fitted
 
-from .label import _encode, _encode_check_unknown
+from .label import _encode, _encode_check_unknown, _encode_numpy
 
 
 __all__ = [
@@ -78,7 +80,15 @@ def _fit(self, X, handle_unknown='error'):
                 raise ValueError("Shape mismatch: if categories is an array,"
                                  " it has to be of shape (n_features,).")
 
+        if self.max_levels is not None:
+            if (not isinstance(self.max_levels, Integral) or
+                    self.max_levels <= 0):
+                raise ValueError("max_levels must be None or a strictly "
+                                 "positive int, got {}.".format(
+                                     self.max_levels))
+
         self.categories_ = []
+        self.infrequent_indices_ = []
 
         for i in range(n_features):
             Xi = X_list[i]
@@ -98,6 +108,18 @@ def _fit(self, X, handle_unknown='error'):
                     raise ValueError(msg)
             self.categories_.append(cats)
 
+            if self.max_levels is not None:
+                infrequent_indices = self._find_infrequent_category_indices(Xi)
+            else:
+                infrequent_indices = np.array([])
+            self.infrequent_indices_.append(infrequent_indices)
+
+    def _find_infrequent_category_indices(self, Xi):
+        # TODO: this is using unique on X again. Ideally we should integrate
+        # this into _encode()
+        _, counts = np.unique(Xi, return_counts=True)
+        return np.argsort(counts)[:-self.max_levels]
+
     def _transform(self, X, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
 
@@ -142,6 +164,29 @@ def _transform(self, X, handle_unknown='error'):
                                      check_unknown=False)
                 X_int[:, i] = encoded
 
+        # We need to take care of infrequent categories here. We want all the
+        # infrequent categories to end up in a specific column, after all the
+        # frequent ones. Let's say we have 4 categories with 2 infrequent
+        # categories (and 2 frequent categories): we want the value in X_int
+        # for the infrequent categories to be 2 (third and last column), and
+        # the values for the frequent ones to be 0 and 1. The piece of code
+        # below performs this mapping.
+        # TODO: maybe integrate this part with the one above
+        self._infrequent_mappings = {}
+        huge_int = np.iinfo(X_int.dtype).max
+        for feature_idx in range(n_features):
+            if self.infrequent_indices_[feature_idx].size > 0:
+                mapping = np.arange(len(self.categories_[feature_idx]))
+                # Trick: set the entries of the infrequent categories to a
+                # very big int and encode again.
+                for ordinal_cat in self.infrequent_indices_[feature_idx]:
+                    mapping[ordinal_cat] = huge_int
+                _, mapping = _encode_numpy(mapping, encode=True)
+
+                # update X_int and save mapping for later (for drop logic)
+                X_int[:, feature_idx] = mapping[X_int[:, feature_idx]]
+                self._infrequent_mappings[feature_idx] = mapping
+
         return X_int, X_mask
 
     def _more_tags(self):
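The remapping performed in _transform above is the core of the patch but is
easy to misread, so here is a minimal standalone sketch of the same
computation on a single feature column. It is illustrative only:
np.searchsorted and np.unique(..., return_inverse=True) stand in for the
private _encode/_encode_numpy helpers, and the variable names are invented.

import numpy as np

Xi = np.array([0, 1, 3, 3, 3, 3, 1, 0, 3])  # first feature of the test below
max_levels = 2

# _find_infrequent_category_indices: flag everything except the max_levels
# most frequent categories.
categories, counts = np.unique(Xi, return_counts=True)  # [0, 1, 3], [2, 2, 5]
infrequent = np.argsort(counts)[:-max_levels]           # [0] -> category 0

# _transform: start from the plain integer codes 0..n_categories-1.
codes = np.searchsorted(categories, Xi)

# The trick: send the infrequent entries of the identity mapping to a huge
# sentinel and encode again; after re-encoding, every infrequent category
# shares the highest code, i.e. the last one-hot column.
mapping = np.arange(len(categories))
mapping[infrequent] = np.iinfo(codes.dtype).max
_, mapping = np.unique(mapping, return_inverse=True)    # [2, 0, 1]

print(mapping[codes])  # [2 0 1 1 1 1 0 2 1]: 1 -> 0, 3 -> 1, infrequent 0 -> 2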
@@ -193,7 +238,11 @@ class OneHotEncoder(_BaseEncoder):
         - 'first' : drop the first category in each feature. If only one
           category is present, the feature will be dropped entirely.
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
-          should be dropped.
+          should be dropped. If ``drop[i]`` is an infrequent category, an
+          error is raised: it is only possible to drop all of the infrequent
+          categories, not just one of them.
+        - 'infrequent' : drop the infrequent categories column (see the
+          ``max_levels`` parameter).
 
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
@@ -209,6 +258,10 @@ class OneHotEncoder(_BaseEncoder):
         will be all zeros. In the inverse transform, an unknown category
         will be denoted as None.
 
+    max_levels : int, default=None
+        Maximum number of categories to keep per feature. Infrequent
+        categories are grouped together and mapped into a single column.
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -222,6 +275,10 @@ class OneHotEncoder(_BaseEncoder):
         be dropped for each feature. None if all the transformed features
        will be retained.
 
+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains the indices in ``categories_[i]``
+        of the infrequent categories.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
@@ -266,12 +323,13 @@ class OneHotEncoder(_BaseEncoder):
     """
 
     def __init__(self, categories='auto', drop=None, sparse=True,
-                 dtype=np.float64, handle_unknown='error'):
+                 dtype=np.float64, handle_unknown='error', max_levels=None):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.drop = drop
+        self.max_levels = max_levels
 
     def _validate_keywords(self):
         if self.handle_unknown not in ('error', 'ignore'):
@@ -290,7 +348,8 @@ def _validate_keywords(self):
     def _compute_drop_idx(self):
         if self.drop is None:
             return None
-        elif (isinstance(self.drop, str) and self.drop == 'first'):
+        elif (isinstance(self.drop, str) and
+                self.drop in ('first', 'infrequent')):
             return np.zeros(len(self.categories_), dtype=np.int_)
         elif not isinstance(self.drop, str):
             try:
@@ -338,6 +397,20 @@ def fit(self, X, y=None):
         self._validate_keywords()
         self._fit(X, handle_unknown=self.handle_unknown)
         self.drop_idx_ = self._compute_drop_idx()
+        # check whether the user wants to manually drop a category that
+        # is infrequent: this is not allowed
+        if self.drop is not None and not isinstance(self.drop, str):
+            for feature_idx, (infrequent_indices, drop_idx) in enumerate(
+                    zip(self.infrequent_indices_, self.drop_idx_)):
+                if drop_idx in infrequent_indices:
+                    raise ValueError(
+                        "Category {} of feature {} is infrequent and thus "
+                        "cannot be dropped. Use drop='infrequent' "
+                        "instead.".format(
+                            self.categories_[feature_idx][drop_idx],
+                            feature_idx
+                        )
+                    )
         return self
 
     def fit_transform(self, X, y=None):
@@ -374,24 +447,58 @@ def transform(self, X):
         check_is_fitted(self, 'categories_')
         # validation of X happens in _check_X called by _transform
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-
         n_samples, n_features = X_int.shape
 
-        if self.drop is not None:
-            to_drop = self.drop_idx_.reshape(1, -1)
-
-            # We remove all the dropped categories from mask, and decrement all
-            # categories that occur after them to avoid an empty column.
+        # n_columns indicates, for each feature, how many columns are used in
+        # X_trans. By default this corresponds to the number of categories,
+        # but it will differ if we drop some of them, or if there are
+        # infrequent categories (which are all mapped to the same column).
+        n_columns = [len(cats) for cats in self.categories_]
+        for feature_idx in range(n_features):
+            n_infrequent = self.infrequent_indices_[feature_idx].size
+            if n_infrequent > 0:
+                # the infrequent categories collapse into a single column:
+                # still add 1 for the infrequent column
+                n_columns[feature_idx] += 1 - n_infrequent
+            if self.drop is not None:
+                # if drop is not None we always drop one column in general,
+                # except when drop is 'infrequent' and there is no infrequent
+                # category.
+                n_columns[feature_idx] -= 1
+                if (isinstance(self.drop, str) and self.drop == 'infrequent'
+                        and n_infrequent == 0):
+                    n_columns[feature_idx] += 1  # revert decrement from above
 
-            keep_cells = X_int != to_drop
+        if self.drop is not None:
+            to_drop = self.drop_idx_.copy()
+
+            if isinstance(self.drop, str):
+                if self.drop == 'infrequent':
+                    for feature_idx in range(n_features):
+                        if self.infrequent_indices_[feature_idx].size > 0:
+                            # drop the infrequent column (i.e. the last one)
+                            to_drop[feature_idx] = n_columns[feature_idx]
+                        else:
+                            # no infrequent category: use the special marker
+                            # -1 so that no dropping happens for this feature
+                            to_drop[feature_idx] = -1
+            else:
+                # self.drop is an array of categories: we need to remap the
+                # dropped indices if some of the categories are infrequent.
+                # See _transform() for details about the mapping.
+                for feature_idx in range(n_features):
+                    if self.infrequent_indices_[feature_idx].size > 0:
+                        mapping = self._infrequent_mappings[feature_idx]
+                        to_drop[feature_idx] = mapping[to_drop[feature_idx]]
+
+            # We remove all the dropped categories from mask, and decrement
+            # all categories that occur after them to avoid an empty column.
+            to_drop = to_drop.reshape(1, -1)
+            keep_cells = (X_int != to_drop) | (to_drop == -1)
             X_mask &= keep_cells
-            X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - 1 for cats in self.categories_]
-        else:
-            n_values = [len(cats) for cats in self.categories_]
+            X_int[(X_int > to_drop) & (to_drop != -1)] -= 1
 
         mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
+        n_values = np.array([0] + n_columns)
         feature_indices = np.cumsum(n_values)
         indices = (X_int + feature_indices[:-1]).ravel()[mask]
         indptr = X_mask.sum(axis=1).cumsum()
@@ -552,6 +659,10 @@ class OrdinalEncoder(_BaseEncoder):
     dtype : number type, default np.float64
         Desired dtype of output.
 
+    max_levels : int, default=None
+        Maximum number of categories to keep per feature. Infrequent
+        categories are grouped together and mapped to the highest int.
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -559,6 +670,10 @@ class OrdinalEncoder(_BaseEncoder):
         (in order of the features in X and corresponding with the output
         of ``transform``).
 
+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains the indices in ``categories_[i]``
+        of the infrequent categories.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
@@ -587,9 +702,10 @@ class OrdinalEncoder(_BaseEncoder):
     between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, max_levels=None):
         self.categories = categories
         self.dtype = dtype
+        self.max_levels = max_levels
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
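Before the tests, a short usage sketch of the API this patch adds. It assumes
the branch is installed: max_levels, drop='infrequent' and
infrequent_indices_ exist only in this patch and are not part of any released
scikit-learn.

import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

X = np.array([['cat'], ['cat'], ['dog'], ['dog'], ['dog'], ['snake']])

# Keep the 2 most frequent categories per feature: 'snake' is infrequent.
oe = OrdinalEncoder(max_levels=2).fit(X)
print(oe.infrequent_indices_)   # [array([2])], the index of 'snake'
print(oe.transform(X).ravel())  # [0. 0. 1. 1. 1. 2.], the highest code is 2

# One column per frequent category plus one shared infrequent column;
# drop='infrequent' then removes that extra column, so 'snake' rows are
# encoded as all zeros.
ohe = OneHotEncoder(categories='auto', max_levels=2, drop='infrequent')
print(ohe.fit_transform(X).toarray())
# [[1. 0.]
#  [1. 0.]
#  [0. 1.]
#  [0. 1.]
#  [0. 1.]
#  [0. 0.]]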
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 8e1a61781544a..08d95907f0808 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -673,6 +673,120 @@ def test_categories(density, drop):
     assert ohe_test.drop_idx_.dtype == np.int_
 
 
+def test_infrequent_categories_sanity():
+    # Not a super legit test for now.
+    # Mostly aimed at explaining how the infrequent categories are handled.
+
+    X = [[0, 0, 1],
+         [1, 0, 0],
+         [3, 5, 1],
+         [3, 1, 0],
+         [3, 1, 1],
+         [3, 2, 0],
+         [1, 5, 1],
+         [0, 5, 0],
+         [3, 0, 1]]
+    X = np.array(X)
+
+    # Check the infrequent_indices_ attribute
+    oe = OrdinalEncoder(max_levels=2)
+    X_trans = oe.fit_transform(X)
+    # first feature: category 0 is infrequent
+    # (1 is just as rare, but we only flag one category since we keep 2)
+    assert len(oe.infrequent_indices_[0]) == 1
+    assert oe.categories_[0][oe.infrequent_indices_[0][0]] == 0
+    # second feature: categories 2 and 1 are infrequent
+    # 2 comes first because it has fewer occurrences than 1
+    assert len(oe.infrequent_indices_[1]) == 2
+    assert oe.categories_[1][oe.infrequent_indices_[1][0]] == 2
+    assert oe.categories_[1][oe.infrequent_indices_[1][1]] == 1
+    # third feature: no infrequent category
+    assert len(oe.infrequent_indices_[2]) == 0
+
+    # For the ordinal encoder, the infrequent categories are all assigned
+    # the highest integer code.
+    expected_X_trans = [[2, 0, 1],
+                        [0, 0, 0],
+                        [1, 1, 1],
+                        [1, 2, 0],
+                        [1, 2, 1],
+                        [1, 2, 0],
+                        [0, 1, 1],
+                        [2, 1, 0],
+                        [1, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    ohe = OneHotEncoder(categories='auto', max_levels=2)
+    X_trans = ohe.fit_transform(X).toarray()
+    # first feature: 0 is treated as infrequent and ends up in
+    # X_trans[:, 2]
+    # second feature: 1 and 2 are treated as infrequent and end up in
+    # X_trans[:, 5]
+    # third feature: no infrequent category, represented by the last two
+    # columns
+    expected_X_trans = [[0, 0, 1, 1, 0, 0, 0, 1],
+                        [1, 0, 0, 1, 0, 0, 1, 0],
+                        [0, 1, 0, 0, 1, 0, 0, 1],
+                        [0, 1, 0, 0, 0, 1, 1, 0],
+                        [0, 1, 0, 0, 0, 1, 0, 1],
+                        [0, 1, 0, 0, 0, 1, 1, 0],
+                        [1, 0, 0, 0, 1, 0, 0, 1],
+                        [0, 0, 1, 0, 1, 0, 1, 0],
+                        [0, 1, 0, 1, 0, 0, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping the first column works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop='first')
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 1, 0, 0, 1],
+                        [0, 0, 0, 0, 0],
+                        [1, 0, 1, 0, 1],
+                        [1, 0, 0, 1, 0],
+                        [1, 0, 0, 1, 1],
+                        [1, 0, 0, 1, 0],
+                        [0, 0, 1, 0, 1],
+                        [0, 1, 1, 0, 0],
+                        [1, 0, 0, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping explicit categories works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 5, 1])
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 1, 1, 0, 0],
+                        [1, 0, 1, 0, 1],
+                        [0, 0, 0, 0, 0],
+                        [0, 0, 0, 1, 1],
+                        [0, 0, 0, 1, 0],
+                        [0, 0, 0, 1, 1],
+                        [1, 0, 0, 0, 0],
+                        [0, 1, 0, 0, 1],
+                        [0, 0, 1, 0, 0]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping the infrequent categories works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop='infrequent')
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 0, 1, 0, 0, 1],
+                        [1, 0, 1, 0, 1, 0],
+                        [0, 1, 0, 1, 0, 1],
+                        [0, 1, 0, 0, 1, 0],
+                        [0, 1, 0, 0, 0, 1],
+                        [0, 1, 0, 0, 1, 0],
+                        [1, 0, 0, 1, 0, 1],
+                        [0, 0, 0, 1, 1, 0],
+                        [0, 1, 1, 0, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Manually dropping a category that is infrequent is not allowed
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 1, 1])
+    err_msg = ("Category 1 of feature 1 is infrequent and thus cannot be "
+               "dropped")
+    with pytest.raises(ValueError, match=err_msg):
+        ohe.fit(X)
+
+
 @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
 def test_encoders_has_categorical_tags(Encoder):
     assert 'categorical' in Encoder()._get_tags()['X_types']
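As a reading aid for the expected arrays in the test above, the following
snippet (again assuming this branch is installed) prints which category each
one-hot column encodes, feature by feature.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0, 0, 1], [1, 0, 0], [3, 5, 1], [3, 1, 0], [3, 1, 1],
              [3, 2, 0], [1, 5, 1], [0, 5, 0], [3, 0, 1]])
ohe = OneHotEncoder(categories='auto', max_levels=2).fit(X)

for i, (cats, infreq) in enumerate(zip(ohe.categories_,
                                       ohe.infrequent_indices_)):
    infreq = infreq.astype(int)
    # frequent categories get one column each, in sorted order ...
    columns = [str(c) for c in np.delete(cats, infreq)]
    # ... and all infrequent categories share one trailing column
    if infreq.size > 0:
        columns.append('infrequent %s' % cats[infreq])
    print('feature %d -> columns for %s' % (i, columns))

# feature 0 -> columns for ['1', '3', 'infrequent [0]']
# feature 1 -> columns for ['0', '5', 'infrequent [2 1]']
# feature 2 -> columns for ['0', '1']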