[MRG] Add support for infrequent categories in OneHotEncoder and OrdinalEncoder #13833
Closed
Changes from all commits (10):

9dda919 WIP (NicolasHug)
758191f WIP (NicolasHug)
4cff102 WIP (NicolasHug)
d2a1a06 some tests (NicolasHug)
0533761 added support for drop='infrequent' (NicolasHug)
99352b6 comment (NicolasHug)
8a3b827 pep8 (NicolasHug)
14dc56a Merge branch 'master' of github.com:scikit-learn/scikit-learn into un… (NicolasHug)
e110419 pep8 (NicolasHug)
69b738f Added docstring for max_levels (NicolasHug)
@@ -2,6 +2,8 @@
 # Joris Van den Bossche <[email protected]>
 # License: BSD 3 clause

+from numbers import Integral
+
 import numpy as np
 from scipy import sparse
@@ -10,7 +12,7 @@
 from ..utils.fixes import _argmax
 from ..utils.validation import check_is_fitted

-from .label import _encode, _encode_check_unknown
+from .label import _encode, _encode_check_unknown, _encode_numpy


 __all__ = [
@@ -78,7 +80,15 @@ def _fit(self, X, handle_unknown='error'):
             raise ValueError("Shape mismatch: if categories is an array,"
                              " it has to be of shape (n_features,).")

+        if self.max_levels is not None:
+            if (not isinstance(self.max_levels, Integral) or
+                    self.max_levels <= 0):
+                raise ValueError("max_levels must be None or a strictly "
+                                 "positive int, got {}.".format(
+                                     self.max_levels))
+
         self.categories_ = []
+        self.infrequent_indices_ = []

         for i in range(n_features):
             Xi = X_list[i]
@@ -98,6 +108,18 @@ def _fit(self, X, handle_unknown='error'):
                 raise ValueError(msg)
             self.categories_.append(cats)

+            if self.max_levels is not None:
+                infrequent_indices = self._find_infrequent_category_indices(Xi)
+            else:
+                infrequent_indices = np.array([])
+            self.infrequent_indices_.append(infrequent_indices)
+
+    def _find_infrequent_category_indices(self, Xi):
+        # TODO: this is using unique on X again. Ideally we should integrate
+        # this into _encode()
+        _, counts = np.unique(Xi, return_counts=True)
+        return np.argsort(counts)[:-self.max_levels]
+
     def _transform(self, X, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
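The helper above can be illustrated standalone: keep the `max_levels` most frequent categories and report the rest as infrequent. This is a minimal numpy sketch of the same argsort-on-counts idea (the function name is illustrative, not part of the PR):

```python
import numpy as np

def find_infrequent_indices(column, max_levels):
    """Return indices (into the sorted unique categories) of every
    category except the max_levels most frequent ones."""
    _, counts = np.unique(column, return_counts=True)
    # argsort is ascending, so everything before the last
    # `max_levels` positions is infrequent
    return np.argsort(counts)[:-max_levels]
```

For a column with 5 'a', 3 'b' and 1 'c' and `max_levels=2`, only 'c' (index 2 in the sorted categories) is reported infrequent.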
@@ -142,6 +164,29 @@ def _transform(self, X, handle_unknown='error'):
                                     check_unknown=False)
             X_int[:, i] = encoded

+        # We need to take care of infrequent categories here. We want all the
+        # infrequent categories to end up in a specific column, after all the
+        # frequent ones. Let's say we have 4 categories with 2 infrequent
+        # categories (and 2 frequent categories): we want the value in X_int
+        # for the infrequent categories to be 2 (third and last column), and
+        # the values for the frequent ones to be 0 and 1. The piece of code
+        # below performs this mapping.
+        # TODO: maybe integrate this part with the one above
+        self._infrequent_mappings = {}
+        huge_int = np.iinfo(X_int.dtype).max
+        for feature_idx in range(n_features):
+            if self.infrequent_indices_[feature_idx].size > 0:
+                mapping = np.arange(len(self.categories_[feature_idx]))
+                # Trick: set the infrequent cats columns to a very big int and
+                # encode again.
+                for ordinal_cat in self.infrequent_indices_[feature_idx]:
+                    mapping[ordinal_cat] = huge_int
+                _, mapping = _encode_numpy(mapping, encode=True)
+
+                # update X_int and save mapping for later (for dropping logic)
+                X_int[:, feature_idx] = mapping[X_int[:, feature_idx]]
+                self._infrequent_mappings[feature_idx] = mapping
+
         return X_int, X_mask

     def _more_tags(self):
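The "huge int" re-encoding trick in the hunk above can be shown in isolation. Here `np.unique(..., return_inverse=True)` stands in for the private `_encode_numpy` helper, which is assumed to behave like a sort-based encode:

```python
import numpy as np

categories = np.array(['a', 'b', 'c', 'd'])   # 2 frequent + 2 infrequent
infrequent_indices = np.array([1, 3])         # 'b' and 'd' are infrequent

mapping = np.arange(len(categories))
huge_int = np.iinfo(mapping.dtype).max
mapping[infrequent_indices] = huge_int        # push infrequent cats past all others
# encode again: frequent cats get codes 0..n_frequent-1 and every
# infrequent cat collapses onto the single last code
_, mapping = np.unique(mapping, return_inverse=True)
# mapping is now [0, 2, 1, 2]: 'a'->0, 'c'->1, 'b' and 'd'->2
```

Indexing `mapping` with an already-encoded integer column (`mapping[X_int[:, i]]`) then sends every infrequent category to the same final column.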
@@ -193,7 +238,11 @@ class OneHotEncoder(_BaseEncoder):
         - 'first' : drop the first category in each feature. If only one
           category is present, the feature will be dropped entirely.
         - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
-          should be dropped.
+          should be dropped. If ``drop[i]`` is an infrequent category, an
+          error is raised: it is only possible to drop all of the infrequent
+          categories, not just one of them.
+        - 'infrequent' : drop the infrequent categories column (see
+          ``max_levels`` parameter).

     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
@@ -209,6 +258,10 @@ class OneHotEncoder(_BaseEncoder):
         will be all zeros. In the inverse transform, an unknown category
         will be denoted as None.

+    max_levels : int, default=None
+        Maximum number of categories to keep. Infrequent categories are
+        grouped together and mapped into a single column.
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -222,6 +275,10 @@ class OneHotEncoder(_BaseEncoder):
         be dropped for each feature. None if all the transformed features will
         be retained.

+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains a list of indices in
+        ``categories_[i]`` corresponding to the infrequent categories.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
@@ -266,12 +323,13 @@ class OneHotEncoder(_BaseEncoder):
     """

     def __init__(self, categories='auto', drop=None, sparse=True,
-                 dtype=np.float64, handle_unknown='error'):
+                 dtype=np.float64, handle_unknown='error', max_levels=None):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.drop = drop
+        self.max_levels = max_levels

     def _validate_keywords(self):
         if self.handle_unknown not in ('error', 'ignore'):
@@ -290,7 +348,8 @@ def _validate_keywords(self):
     def _compute_drop_idx(self):
         if self.drop is None:
             return None
-        elif (isinstance(self.drop, str) and self.drop == 'first'):
+        elif (isinstance(self.drop, str) and
+              self.drop in ('first', 'infrequent')):
             return np.zeros(len(self.categories_), dtype=np.int_)
         elif not isinstance(self.drop, str):
             try:
@@ -338,6 +397,20 @@ def fit(self, X, y=None):
         self._validate_keywords()
         self._fit(X, handle_unknown=self.handle_unknown)
         self.drop_idx_ = self._compute_drop_idx()
+        # check if user wants to manually drop a feature that is
+        # infrequent: this is not allowed
+        if self.drop is not None and not isinstance(self.drop, str):
+            for feature_idx, (infrequent_indices, drop_idx) in enumerate(
+                    zip(self.infrequent_indices_, self.drop_idx_)):
+                if drop_idx in infrequent_indices:
+                    raise ValueError(
+                        "Category {} of feature {} is infrequent and thus "
+                        "cannot be dropped. Use drop='infrequent' "
+                        "instead.".format(
+                            self.categories_[feature_idx][drop_idx],
+                            feature_idx
+                        )
+                    )
         return self

     def fit_transform(self, X, y=None):
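The fit-time check above boils down to: a manually dropped category must not be one of the infrequent ones. A standalone sketch (the function name and message are illustrative, not the PR's exact code):

```python
import numpy as np

def check_drop_allowed(infrequent_indices, drop_idx):
    """Reject dropping a single infrequent category, mirroring the
    validation added in fit()."""
    if drop_idx in infrequent_indices:
        raise ValueError("cannot drop an infrequent category; "
                         "use drop='infrequent' instead")
```

Dropping a frequent category passes silently; dropping any index listed in `infrequent_indices` raises.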
@@ -374,24 +447,58 @@ def transform(self, X):
         check_is_fitted(self, 'categories_')
         # validation of X happens in _check_X called by _transform
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

         n_samples, n_features = X_int.shape

-        if self.drop is not None:
-            to_drop = self.drop_idx_.reshape(1, -1)
-
-            # We remove all the dropped categories from mask, and decrement all
-            # categories that occur after them to avoid an empty column.
-
-            keep_cells = X_int != to_drop
+        # n_columns indicates, for each feature, how many columns are used in
+        # X_trans. By default this corresponds to the number of categories, but
+        # will differ if we drop some of them, or if there are infrequent
+        # categories (all mapped to the same column)
+        n_columns = [len(cats) for cats in self.categories_]
+        for feature_idx in range(n_features):
+            n_infrequent = self.infrequent_indices_[feature_idx].size
+            if n_infrequent > 0:
+                # still add 1 for the infrequent column
+                n_columns[feature_idx] += 1 - n_infrequent
+            if self.drop is not None:
+                # if drop is not None we always drop one column in general,
+                # except when drop is 'infrequent' and there is no infrequent
+                # category.
+                n_columns[feature_idx] -= 1
+                if (isinstance(self.drop, str) and self.drop == 'infrequent'
+                        and n_infrequent == 0):
+                    n_columns[feature_idx] += 1  # revert decrement from above
+
+        if self.drop is not None:
+            to_drop = self.drop_idx_.copy()
+
+            if isinstance(self.drop, str):
+                if self.drop == 'infrequent':
+                    for feature_idx in range(n_features):
+                        if self.infrequent_indices_[feature_idx].size > 0:
+                            # drop the infrequent column (i.e. the last one)
+                            to_drop[feature_idx] = n_columns[feature_idx]
+                        else:
+                            # no infrequent category, use special marker -1
+                            # so that no dropping happens for this feature
+                            to_drop[feature_idx] = -1
+            else:
+                # self.drop is an array of categories. we need to remap the
+                # dropped indexes if some of the categories are infrequent.
+                # see _transform() for details about the mapping.
+                for feature_idx in range(n_features):
+                    if self.infrequent_indices_[feature_idx].size > 0:
+                        mapping = self._infrequent_mappings[feature_idx]
+                        to_drop[feature_idx] = mapping[to_drop[feature_idx]]
+
+            # We remove all the dropped categories from mask, and decrement
+            # all categories that occur after them to avoid an empty column.
+            to_drop = to_drop.reshape(1, -1)
+            keep_cells = (X_int != to_drop) | (to_drop == -1)
             X_mask &= keep_cells
-            X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - 1 for cats in self.categories_]
-        else:
-            n_values = [len(cats) for cats in self.categories_]
+            X_int[(X_int > to_drop) & (to_drop != -1)] -= 1

         mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
+        n_values = np.array([0] + n_columns)
         feature_indices = np.cumsum(n_values)
         indices = (X_int + feature_indices[:-1]).ravel()[mask]
         indptr = X_mask.sum(axis=1).cumsum()
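The per-feature column bookkeeping in the hunk above can be condensed into one small function: grouping collapses all infrequent categories into one column, and `drop` removes one more column except when `drop='infrequent'` finds nothing to drop. A sketch with an illustrative name:

```python
def n_output_columns(n_categories, n_infrequent, drop):
    """Number of one-hot columns a feature occupies, following the
    n_columns logic in transform()."""
    n = n_categories
    if n_infrequent > 0:
        n += 1 - n_infrequent          # all infrequent cats share one column
    if drop is not None:
        n -= 1                         # one column is dropped...
        if drop == 'infrequent' and n_infrequent == 0:
            n += 1                     # ...unless there was nothing to drop
    return n
```

For example, 4 categories with 2 infrequent ones give 3 columns with no drop, and 2 columns with `drop='infrequent'`.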
@@ -552,13 +659,21 @@ class OrdinalEncoder(_BaseEncoder):
     dtype : number type, default np.float64
         Desired dtype of output.

+    max_levels : int, default=None
+        Maximum number of categories to keep. Infrequent categories are
+        grouped together and mapped to the highest int.
+
     Attributes
     ----------
     categories_ : list of arrays
         The categories of each feature determined during fitting
         (in order of the features in X and corresponding with the output
         of ``transform``).

+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains a list of indices in
+        ``categories_[i]`` corresponding to the infrequent categories.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
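For OrdinalEncoder, the docstring above says infrequent categories are "mapped to the highest int". A pure-numpy illustration of that behaviour (not the PR's implementation, which reuses the shared `_BaseEncoder` machinery):

```python
import numpy as np

def ordinal_encode_with_max_levels(column, max_levels):
    """Ordinal-encode a single column, collapsing all but the
    max_levels most frequent categories onto the highest code."""
    cats, inverse, counts = np.unique(column, return_inverse=True,
                                      return_counts=True)
    codes = np.arange(len(cats))
    infrequent = np.argsort(counts)[:-max_levels]
    if infrequent.size:
        # same trick as in _transform: push infrequent cats past all
        # others, then re-encode so they collapse onto the last code
        codes[infrequent] = np.iinfo(codes.dtype).max
        _, codes = np.unique(codes, return_inverse=True)
    return codes[inverse]
```

With 4 categories and `max_levels=2`, the two rarest categories end up sharing the highest code.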
@@ -587,9 +702,10 @@ class OrdinalEncoder(_BaseEncoder):
     between 0 and n_classes-1.
     """

-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, max_levels=None):
         self.categories = categories
         self.dtype = dtype
+        self.max_levels = max_levels

     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
Review comment: can't we just use _encode rather than _encode_numpy?