[MRG] Add support for infrequent categories in OneHotEncoder and OrdinalEncoder #13833

Closed

150 changes: 133 additions & 17 deletions sklearn/preprocessing/_encoders.py
@@ -2,6 +2,8 @@
# Joris Van den Bossche <[email protected]>
# License: BSD 3 clause

from numbers import Integral

import numpy as np
from scipy import sparse

@@ -10,7 +12,7 @@
from ..utils.fixes import _argmax
from ..utils.validation import check_is_fitted

from .label import _encode, _encode_check_unknown
from .label import _encode, _encode_check_unknown, _encode_numpy
Review comment (Member): can't we just use _encode rather than _encode_numpy?

__all__ = [
@@ -78,7 +80,15 @@ def _fit(self, X, handle_unknown='error'):
raise ValueError("Shape mismatch: if categories is an array,"
" it has to be of shape (n_features,).")

if self.max_levels is not None:
if (not isinstance(self.max_levels, Integral) or
self.max_levels <= 0):
raise ValueError("max_levels must be None or a strictly "
"positive int, got {}.".format(
self.max_levels))

self.categories_ = []
self.infrequent_indices_ = []

for i in range(n_features):
Xi = X_list[i]
@@ -98,6 +108,18 @@
raise ValueError(msg)
self.categories_.append(cats)

if self.max_levels is not None:
infrequent_indices = self._find_infrequent_category_indices(Xi)
else:
infrequent_indices = np.array([])
self.infrequent_indices_.append(infrequent_indices)

def _find_infrequent_category_indices(self, Xi):
# TODO: this is using unique on X again. Ideally we should integrate
# this into _encode()
_, counts = np.unique(Xi, return_counts=True)
return np.argsort(counts)[:-self.max_levels]
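
As a quick sketch of the selection rule above (standalone NumPy, using the
first feature of the test data added below; ties between equally rare
categories are broken by argsort order):

import numpy as np

Xi = np.array([0, 1, 3, 3, 3, 3, 1, 0, 3])
_, counts = np.unique(Xi, return_counts=True)  # categories [0, 1, 3] -> counts [2, 2, 5]
max_levels = 2
# keep the max_levels most frequent categories, flag the rest:
infrequent = np.argsort(counts)[:-max_levels]  # array([0]), i.e. category 0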

def _transform(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)

@@ -142,6 +164,29 @@ def _transform(self, X, handle_unknown='error'):
check_unknown=False)
X_int[:, i] = encoded

# We need to take care of infrequent categories here. We want all the
# infrequent categories to end up in a specific column, after all the
# frequent ones. Let's say we have 4 categories with 2 infrequent
# categories (and 2 frequent categories): we want the value in X_int
# for the infrequent categories to be 2 (third and last column), and
# the values for the frequent ones to be 0 and 1. The piece of code
# below performs this mapping.
# TODO: maybe integrate this part with the one above
self._infrequent_mappings = {}
huge_int = np.iinfo(X_int.dtype).max
for feature_idx in range(n_features):
if self.infrequent_indices_[feature_idx].size > 0:
mapping = np.arange(len(self.categories_[feature_idx]))
# Trick: map the infrequent categories to a very big int and
# encode again.
for ordinal_cat in self.infrequent_indices_[feature_idx]:
mapping[ordinal_cat] = huge_int
_, mapping = _encode_numpy(mapping, encode=True)

# update X_int and save mapping for later (for dropping logic)
X_int[:, feature_idx] = mapping[X_int[:, feature_idx]]
self._infrequent_mappings[feature_idx] = mapping

return X_int, X_mask
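
A standalone sketch of the remapping above, with 4 categories of which
indices 1 and 3 are infrequent (np.unique(..., return_inverse=True) stands
in for the _encode_numpy call):

import numpy as np

n_categories = 4
infrequent_indices = np.array([1, 3])
huge_int = np.iinfo(np.int64).max

mapping = np.arange(n_categories)       # [0, 1, 2, 3]
mapping[infrequent_indices] = huge_int  # [0, huge, 2, huge]
# encoding again assigns consecutive codes in sorted order, so the two
# frequent categories get 0 and 1 while both infrequent ones share code 2
_, mapping = np.unique(mapping, return_inverse=True)
print(mapping)  # [0 2 1 2]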

def _more_tags(self):
@@ -193,7 +238,11 @@ class OneHotEncoder(_BaseEncoder):
- 'first' : drop the first category in each feature. If only one
category is present, the feature will be dropped entirely.
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
should be dropped.
should be dropped. If ``drop[i]`` is an infrequent category, an
error is raised: it is only possible to drop all of the infrequent
categories, not just one of them.
- 'infrequent' : drop the infrequent categories column (see
``max_levels`` parameter).

sparse : boolean, default=True
Will return sparse matrix if set True else will return an array.
@@ -209,6 +258,10 @@
will be all zeros. In the inverse transform, an unknown category
will be denoted as None.

max_levels : int, default=None
Maximum number of categories to keep. Infrequent categories are
grouped together and mapped into a single column.

Attributes
----------
categories_ : list of arrays
Expand All @@ -222,6 +275,10 @@ class OneHotEncoder(_BaseEncoder):
be dropped for each feature. None if all the transformed features will
be retained.

infrequent_indices_: list of arrays of shape(n_infrequent_categories)
``infrequent_indices_[i]`` contains a list of indices in
``categories_[i]`` corresponding to the infrequent categories.

Review suggestion (Member): infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)

Review suggestion (Member): ``infrequent_indices_[i]`` contains an array of indices in

Examples
--------
Given a dataset with two features, we let the encoder find the unique
@@ -266,12 +323,13 @@ class OneHotEncoder(_BaseEncoder):
"""

def __init__(self, categories='auto', drop=None, sparse=True,
dtype=np.float64, handle_unknown='error'):
dtype=np.float64, handle_unknown='error', max_levels=None):
self.categories = categories
self.sparse = sparse
self.dtype = dtype
self.handle_unknown = handle_unknown
self.drop = drop
self.max_levels = max_levels

def _validate_keywords(self):
if self.handle_unknown not in ('error', 'ignore'):
@@ -290,7 +348,8 @@ def _validate_keywords(self):
def _compute_drop_idx(self):
if self.drop is None:
return None
elif (isinstance(self.drop, str) and self.drop == 'first'):
elif (isinstance(self.drop, str) and
self.drop in ('first', 'infrequent')):
return np.zeros(len(self.categories_), dtype=np.int_)
elif not isinstance(self.drop, str):
try:
@@ -338,6 +397,20 @@ def fit(self, X, y=None):
self._validate_keywords()
self._fit(X, handle_unknown=self.handle_unknown)
self.drop_idx_ = self._compute_drop_idx()
# check if user wants to manually drop a feature that is
# infrequent: this is not allowed
if self.drop is not None and not isinstance(self.drop, str):
for feature_idx, (infrequent_indices, drop_idx) in enumerate(
zip(self.infrequent_indices_, self.drop_idx_)):
if drop_idx in infrequent_indices:
raise ValueError(
"Category {} of feature {} is infrequent and thus "
"cannot be dropped. Use drop='infrequent' "
"instead.".format(
self.categories_[feature_idx][drop_idx],
feature_idx
)
)
return self
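
A quick sketch of the guard above (assuming this branch is installed; same
data as the new test below):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0, 0, 1], [1, 0, 0], [3, 5, 1], [3, 1, 0], [3, 1, 1],
              [3, 2, 0], [1, 5, 1], [0, 5, 0], [3, 0, 1]])
# category 1 of feature 1 is infrequent, so asking to drop it fails:
ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 1, 1])
ohe.fit(X)  # ValueError: Category 1 of feature 1 is infrequent and thus
            # cannot be dropped. Use drop='infrequent' instead.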

def fit_transform(self, X, y=None):
@@ -374,24 +447,58 @@ def transform(self, X):
check_is_fitted(self, 'categories_')
# validation of X happens in _check_X called by _transform
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

n_samples, n_features = X_int.shape

if self.drop is not None:
to_drop = self.drop_idx_.reshape(1, -1)

# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.
# n_columns indicates, for each feature, how many columns are used in
# X_trans. By default this corresponds to the number of categories, but
# will differ if we drop some of them, or if there are infrequent
# categories (all mapped to the same column)
n_columns = [len(cats) for cats in self.categories_]
for feature_idx in range(n_features):
n_infrequent = self.infrequent_indices_[feature_idx].size
if n_infrequent > 0:
# still add 1 for the infrequent column
n_columns[feature_idx] += 1 - n_infrequent
if self.drop is not None:
# if drop is not None we always drop one column in general,
# except when drop is 'infrequent' and there is no infrequent
# category.
n_columns[feature_idx] -= 1
if (isinstance(self.drop, str) and self.drop == 'infrequent'
and n_infrequent == 0):
n_columns[feature_idx] += 1 # revert decrement from above

keep_cells = X_int != to_drop
if self.drop is not None:
to_drop = self.drop_idx_.copy()

if isinstance(self.drop, str):
if self.drop == 'infrequent':
for feature_idx in range(n_features):
if self.infrequent_indices_[feature_idx].size > 0:
# drop the infrequent column (i.e. the last one)
to_drop[feature_idx] = n_columns[feature_idx]
else:
# no infrequent category, use special marker -1
# so that no dropping happens for this feature
to_drop[feature_idx] = -1
else:
# self.drop is an array of categories. we need to remap the
# dropped indexes if some of the categories are infrequent.
# see _transform() for details about the mapping.
for feature_idx in range(n_features):
if self.infrequent_indices_[feature_idx].size > 0:
mapping = self._infrequent_mappings[feature_idx]
to_drop[feature_idx] = mapping[to_drop[feature_idx]]

# We remove all the dropped categories from mask, and decrement
# all categories that occur after them to avoid an empty column.
to_drop = to_drop.reshape(1, -1)
keep_cells = (X_int != to_drop) | (to_drop == -1)
X_mask &= keep_cells
X_int[X_int > to_drop] -= 1
n_values = [len(cats) - 1 for cats in self.categories_]
else:
n_values = [len(cats) for cats in self.categories_]
X_int[(X_int > to_drop) & (to_drop != -1)] -= 1

mask = X_mask.ravel()
n_values = np.array([0] + n_values)
n_values = np.array([0] + n_columns)
feature_indices = np.cumsum(n_values)
indices = (X_int + feature_indices[:-1]).ravel()[mask]
indptr = X_mask.sum(axis=1).cumsum()
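
To make the column bookkeeping above concrete, a usage sketch (assuming this
branch is installed; a single-feature version of the new test below):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0, 1, 3, 3, 3, 3, 1, 0, 3]]).T
# categories are [0, 1, 3]; with max_levels=2, category 0 is infrequent, so
# n_columns = 3 (categories) - 1 (infrequent) + 1 (grouped column) = 3,
# laid out as [cat 1, cat 3, infrequent]
ohe = OneHotEncoder(categories='auto', max_levels=2)
print(ohe.fit_transform(X).toarray()[:3])
# [[0. 0. 1.]   <- 0 (infrequent)
#  [1. 0. 0.]   <- 1
#  [0. 1. 0.]]  <- 3

# with drop='infrequent', the grouped column (the last one) is removed:
ohe = OneHotEncoder(categories='auto', max_levels=2, drop='infrequent')
print(ohe.fit_transform(X).toarray()[:3])
# [[0. 0.]
#  [1. 0.]
#  [0. 1.]]
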
@@ -552,13 +659,21 @@ class OrdinalEncoder(_BaseEncoder):
dtype : number type, default np.float64
Desired dtype of output.

max_levels : int, default=None
Maximum number of categories to keep. Infrequent categories are
grouped together and mapped to the highest int.

Attributes
----------
categories_ : list of arrays
The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``).

infrequent_indices_: list of arrays of shape(n_infrequent_categories)
``infrequent_indices_[i]`` contains a list of indices in
``categories_[i]`` corresponding to the infrequent categories.

Examples
--------
Given a dataset with two features, we let the encoder find the unique
@@ -587,9 +702,10 @@ class OrdinalEncoder(_BaseEncoder):
between 0 and n_classes-1.
"""

def __init__(self, categories='auto', dtype=np.float64):
def __init__(self, categories='auto', dtype=np.float64, max_levels=None):
self.categories = categories
self.dtype = dtype
self.max_levels = max_levels

def fit(self, X, y=None):
"""Fit the OrdinalEncoder to X.
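
A usage sketch of max_levels on OrdinalEncoder (assuming this branch is
installed; a single-feature version of the new test below):

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X = np.array([[0, 1, 3, 3, 3, 3, 1, 0, 3]]).T
oe = OrdinalEncoder(max_levels=2)
print(oe.fit_transform(X).ravel())
# category 0 is infrequent and is mapped to the highest code:
# [2. 0. 1. 1. 1. 1. 0. 2. 1.]
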
114 changes: 114 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -673,6 +673,120 @@ def test_categories(density, drop):
assert ohe_test.drop_idx_.dtype == np.int_


def test_infrequent_categories_sanity():
# Not a super legit test for now.
# Mostly aimed at explaining how the infrequent categories are handled.

X = [[0, 0, 1],
[1, 0, 0],
[3, 5, 1],
[3, 1, 0],
[3, 1, 1],
[3, 2, 0],
[1, 5, 1],
[0, 5, 0],
[3, 0, 1]]
X = np.array(X)

# Check infrequent_indices_ attribute
oe = OrdinalEncoder(max_levels=2)
X_trans = oe.fit_transform(X)
# first feature: category 0 is infrequent
# note that 1 is also infrequent but we want to keep 2 categories
assert len(oe.infrequent_indices_[0]) == 1
assert oe.categories_[0][oe.infrequent_indices_[0][0]] == 0
# second feature: categories 2 and 1 are infrequent
# 2 comes first because it has fewer occurrences than 1
assert len(oe.infrequent_indices_[1]) == 2
assert oe.categories_[1][oe.infrequent_indices_[1][0]] == 2
assert oe.categories_[1][oe.infrequent_indices_[1][1]] == 1
# third feature: no infrequent category
assert len(oe.infrequent_indices_[2]) == 0

# For ordinal encoder, the infrequent categories are assigned the highest
# integer.
expected_X_trans = [[2, 0, 1],
[0, 0, 0],
[1, 1, 1],
[1, 2, 0],
[1, 2, 1],
[1, 2, 0],
[0, 1, 1],
[2, 1, 0],
[1, 0, 1]]
assert np.array_equal(X_trans, expected_X_trans)

ohe = OneHotEncoder(categories='auto', max_levels=2)
X_trans = ohe.fit_transform(X).toarray()
# first feature: 0 is treated as infrequent and ends up in
# X_trans[:, 2]
# second feature: 1 and 2 are treated as infrequent and end up in
# X_trans[:, 5]
# third feature: no infrequent category. Represented by the 2 last
# columns
expected_X_trans = [[0, 0, 1, 1, 0, 0, 0, 1],
[1, 0, 0, 1, 0, 0, 1, 0],
[0, 1, 0, 0, 1, 0, 0, 1],
[0, 1, 0, 0, 0, 1, 1, 0],
[0, 1, 0, 0, 0, 1, 0, 1],
[0, 1, 0, 0, 0, 1, 1, 0],
[1, 0, 0, 0, 1, 0, 0, 1],
[0, 0, 1, 0, 1, 0, 1, 0],
[0, 1, 0, 1, 0, 0, 0, 1]]

assert np.array_equal(X_trans, expected_X_trans)

# Dropping the first column works as expected
ohe = OneHotEncoder(categories='auto', max_levels=2, drop='first')
X_trans = ohe.fit_transform(X).toarray()
expected_X_trans = [[0, 1, 0, 0, 1],
[0, 0, 0, 0, 0],
[1, 0, 1, 0, 1],
[1, 0, 0, 1, 0],
[1, 0, 0, 1, 1],
[1, 0, 0, 1, 0],
[0, 0, 1, 0, 1],
[0, 1, 1, 0, 0],
[1, 0, 0, 0, 1]]
assert np.array_equal(X_trans, expected_X_trans)

# Dropping explicit categories works as expected
ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 5, 1])
X_trans = ohe.fit_transform(X).toarray()
expected_X_trans = [[0, 1, 1, 0, 0],
[1, 0, 1, 0, 1],
[0, 0, 0, 0, 0],
[0, 0, 0, 1, 1],
[0, 0, 0, 1, 0],
[0, 0, 0, 1, 1],
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 1],
[0, 0, 1, 0, 0]]
assert np.array_equal(X_trans, expected_X_trans)

# Dropping the infrequent categories works as expected
ohe = OneHotEncoder(categories='auto', max_levels=2, drop='infrequent')
X_trans = ohe.fit_transform(X).toarray()
expected_X_trans = [[0, 0, 1, 0, 0, 1],
[1, 0, 1, 0, 1, 0],
[0, 1, 0, 1, 0, 1],
[0, 1, 0, 0, 1, 0],
[0, 1, 0, 0, 0, 1],
[0, 1, 0, 0, 1, 0],
[1, 0, 0, 1, 0, 1],
[0, 0, 0, 1, 1, 0],
[0, 1, 1, 0, 0, 1]]

assert np.array_equal(X_trans, expected_X_trans)

# Manually dropping a category that is infrequent is not allowed
ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 1, 1])
err_msg = ("Category 1 of feature 1 is infrequent and thus cannot be "
"dropped")
with pytest.raises(ValueError, match=err_msg):
ohe.fit(X)


@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
assert 'categorical' in Encoder()._get_tags()['X_types']