From 9703dd2c4558bfc10e14d7c09c83cd6dd69e6312 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 12 Sep 2019 19:39:20 -0400
Subject: [PATCH 1/9] CLN Refactors encoders

---
 sklearn/preprocessing/_encoders.py | 759 +++++++++++++++++------------
 1 file changed, 460 insertions(+), 299 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 0ee5d32720e63..d4609553f11ad 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -4,9 +4,11 @@
 import numpy as np
 from scipy import sparse
+from itertools import count
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
+from ..utils import safe_indexing
 from ..utils.fixes import _argmax
 from ..utils.validation import check_is_fitted
 
@@ -19,16 +21,23 @@
 ]
 
 
-class _BaseEncoder(TransformerMixin, BaseEstimator):
-    """
-    Base class for encoders that includes the code to categorize and
+class _EncoderUnion(TransformerMixin, BaseEstimator):
+    """Base class for encoders that includes the code to categorize and
     transform the input features.
 
+    The encoders passed to `_fit_list` must define:
+
+    1. `fit(X)` where `X` is a ndarray of shape (n_samples,)
+    2. `transform(X)` where `X` is a ndarray of shape (n_samples,)
+    3. `inverse_transform(X)` where `X` is an encoded ndarray or sparse array
     """
 
+    @property
+    def categories_(self):
+        return [encoder.categories_ for encoder in self._encoders]
+
     def _check_X(self, X):
-        """
-        Perform custom check_array:
+        """Perform custom check_array:
         - convert list of strings to object dtype
         - check for missing values for object dtype data (check_array does
           not do that)
@@ -52,103 +61,111 @@ def _check_X(self, X):
             # to keep the dtype information to be used in the encoder.
             needs_validation = True
 
-        n_samples, n_features = X.shape
+        n_features = X.shape[1]
         X_columns = []
 
         for i in range(n_features):
-            Xi = self._get_feature(X, feature_idx=i)
+            Xi = safe_indexing(X, i, axis=1)
             Xi = check_array(Xi, ensure_2d=False, dtype=None,
                              force_all_finite=needs_validation)
             X_columns.append(Xi)
 
-        return X_columns, n_samples, n_features
+        return X_columns
 
-    def _get_feature(self, X, feature_idx):
-        if hasattr(X, 'iloc'):
-            # pandas dataframes
-            return X.iloc[:, feature_idx]
-        # numpy arrays, sparse arrays
-        return X[:, feature_idx]
+    def _check_categories(self, n_features):
+        """Check categories are consistent with n_features"""
+        categories = self.categories
 
-    def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        if (self.categories != 'auto' and
+                len(self.categories) != n_features):
+            raise ValueError("Shape mismatch: if categories is an array, "
+                             "it has to be of shape (n_features,).")
 
-        if self.categories != 'auto':
-            if len(self.categories) != n_features:
-                raise ValueError("Shape mismatch: if categories is an array,"
-                                 " it has to be of shape (n_features,).")
+        if isinstance(self.categories, str) and self.categories == 'auto':
+            categories = ['auto'] * n_features
+        else:
+            categories = self.categories
 
-        self.categories_ = []
+        return categories
 
-        for i in range(n_features):
-            Xi = X_list[i]
-            if self.categories == 'auto':
-                cats = _encode(Xi)
-            else:
-                cats = np.array(self.categories[i], dtype=Xi.dtype)
-                if Xi.dtype != object:
-                    if not np.all(np.sort(cats) == cats):
-                        raise ValueError("Unsorted categories are not "
-                                         "supported for numerical categories")
-            if handle_unknown == 'error':
-                diff = _encode_check_unknown(Xi, cats)
-                if diff:
-                    msg = ("Found unknown categories {0} in column {1}"
-                           " during fit".format(diff, i))
-                    raise ValueError(msg)
-            self.categories_.append(cats)
-
-    def _transform(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
-
-        X_int = np.zeros((n_samples, n_features), dtype=np.int)
-        X_mask = np.ones((n_samples, n_features), dtype=np.bool)
+    def _fit_list(self, X_list, encoders):
+        """Fit encoders on X_list"""
+        assert len(X_list) == len(encoders)
+
+        for X_col, encoder in zip(X_list, encoders):
+            encoder.fit(X_col)
+
+        # map from X_trans indicies to indicies from the original X
+        X_trans_to_orig_idx = []
+        X_trans_idx = 0
+        for encoder in encoders:
+            n_feat_out = encoder.n_features_out_
+            begin, end = X_trans_idx, X_trans_idx + n_feat_out
+            X_trans_to_orig_idx.append((begin, end))
+            X_trans_idx += n_feat_out
+
+        self._X_trans_to_orig_idx = X_trans_to_orig_idx
+        self._encoders = encoders
+
+    def _transform_list(self, X_list):
+        """Transform X_list with encoders"""
+        n_features = len(X_list)
 
         if n_features != len(self.categories_):
             raise ValueError(
                 "The number of features in X is different to the number of "
                 "features of the fitted data. The fitted data had {} features "
                 "and the X has {} features."
-                .format(len(self.categories_,), n_features)
-            )
+                .format(len(self.categories_,), n_features))
 
-        for i in range(n_features):
-            Xi = X_list[i]
-            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
-                                                     return_mask=True)
-
-            if not np.all(valid_mask):
-                if handle_unknown == 'error':
-                    msg = ("Found unknown categories {0} in column {1}"
-                           " during transform".format(diff, i))
-                    raise ValueError(msg)
-                else:
-                    # Set the problematic rows to an acceptable value and
-                    # continue `The rows are marked `X_mask` and will be
-                    # removed later.
-                    X_mask[:, i] = valid_mask
-                    # cast Xi into the largest string type necessary
-                    # to handle different lengths of numpy strings
-                    if (self.categories_[i].dtype.kind in ('U', 'S')
-                            and self.categories_[i].itemsize > Xi.itemsize):
-                        Xi = Xi.astype(self.categories_[i].dtype)
-                    else:
-                        Xi = Xi.copy()
-
-                    Xi[~valid_mask] = self.categories_[i][0]
-            # We use check_unknown=False, since _encode_check_unknown was
-            # already called above.
-            _, encoded = _encode(Xi, self.categories_[i], encode=True,
-                                 check_unknown=False)
-            X_int[:, i] = encoded
-
-        return X_int, X_mask
+        X_trs = []
+        for encoder, X_col in zip(self._encoders, X_list):
+            X_trs.append(encoder.transform(X_col))
+        return self._hstack(X_trs)
+
+    def _hstack(self, Xs):
+        if any(sparse.issparse(X_tran) for X_tran in Xs):
+            Xs = sparse.hstack(Xs).tocsr()
+        else:
+            Xs = np.hstack(Xs)
+        return Xs
+
+    def inverse_transform(self, X):
+        """Convert the data back into the original representation.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            The transformed data.
+
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, n_features)
+            Inverse transformed array.
+        """
+        check_is_fitted(self)
+        X = check_array(X, accept_sparse='csr')
+
+        n_features = sum(encoder.n_features_out_ for encoder in self._encoders)
+
+        # validate shape of passed X
+        msg = ("Shape of the passed X data is not correct. Expected {0} "
+               "columns, got {1}.")
+        if X.shape[1] != n_features:
+            raise ValueError(msg.format(n_features, X.shape[1]))
+
+        X_trs = []
+        for encoder, (begin, end) in zip(self._encoders,
+                                         self._X_trans_to_orig_idx):
+            X_slice = safe_indexing(X, slice(begin, end), axis=1)
+            X_trs.append(encoder.inverse_transform(X_slice))
+        return self._hstack(X_trs)
 
     def _more_tags(self):
         return {'X_types': ['categorical']}
 
 
-class OneHotEncoder(_BaseEncoder):
+class OneHotEncoder(_EncoderUnion):
     """Encode categorical features as a one-hot numeric array.
 
     The input to this transformer should be an array-like of integers or
@@ -264,233 +281,105 @@ class OneHotEncoder(_BaseEncoder):
         iterables and a multilabel format, e.g. a (samples x classes) binary
         matrix indicating the presence of a class label.
     """
-
     def __init__(self, categories='auto', drop=None, sparse=True,
                  dtype=np.float64, handle_unknown='error'):
+
        self.categories = categories
        self.sparse = sparse
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.drop = drop
 
-    def _validate_keywords(self):
-        if self.handle_unknown not in ('error', 'ignore'):
-            msg = ("handle_unknown should be either 'error' or 'ignore', "
-                   "got {0}.".format(self.handle_unknown))
+    @property
+    def drop_idx_(self):
+        return np.array([encoder.drop_idx_ for encoder in self._encoders])
+
+    def _fit(self, X):
+        """Validate keywords and fit `X` and return `X_list`."""
+        self._validate_keywords()
+        X_list = self._check_X(X)
+        n_features = len(X_list)
+
+        categories = self._check_categories(n_features)
+        drop_kwargs = self._check_drop(n_features)
+
+        encoders = [
+            _SingleOneHotEncoder(categories=cat,
+                                 dtype=self.dtype,
+                                 handle_unknown=self.handle_unknown,
+                                 sparse=self.sparse,
+                                 feature_idx=idx,
+                                 **drop_kwarg)
+            for idx, cat, drop_kwarg in zip(count(), categories, drop_kwargs)]
+
+        self._fit_list(X_list, encoders)
+
+        # validate encoders
+        missing_drops = []
+        for idx, encoder in enumerate(encoders):
+            drop_idx = encoder.drop_idx_
+            if isinstance(drop_idx, str) and drop_idx == 'missing':
+                missing_drops.append((idx, encoder.drop))
+
+        if any(missing_drops):
+            msg = ("The following categories were supposed to be "
+                   "dropped, but were not found in the training "
+                   "data.\n{}".format("\n".join(
+                       ["Category: {}, Feature: {}".format(c, v)
+                        for c, v in missing_drops])))
             raise ValueError(msg)
-        # If we have both dropped columns and ignored unknown
-        # values, there will be ambiguous cells. This creates difficulties
-        # in interpreting the model.
-        if self.drop is not None and self.handle_unknown != 'error':
-            raise ValueError(
-                "`handle_unknown` must be 'error' when the drop parameter is "
-                "specified, as both would create categories that are all "
-                "zero.")
 
-    def _compute_drop_idx(self):
-        if self.drop is None:
-            return None
-        elif (isinstance(self.drop, str) and self.drop == 'first'):
-            return np.zeros(len(self.categories_), dtype=np.int_)
-        elif not isinstance(self.drop, str):
-            try:
-                self.drop = np.asarray(self.drop, dtype=object)
-                droplen = len(self.drop)
-            except (ValueError, TypeError):
-                msg = ("Wrong input for parameter `drop`. Expected "
-                       "'first', None or array of objects, got {}")
-                raise ValueError(msg.format(type(self.drop)))
-            if droplen != len(self.categories_):
-                msg = ("`drop` should have length equal to the number "
-                       "of features ({}), got {}")
-                raise ValueError(msg.format(len(self.categories_),
-                                            len(self.drop)))
-            missing_drops = [(i, val) for i, val in enumerate(self.drop)
-                             if val not in self.categories_[i]]
-            if any(missing_drops):
-                msg = ("The following categories were supposed to be "
-                       "dropped, but were not found in the training "
-                       "data.\n{}".format(
-                           "\n".join(
-                               ["Category: {}, Feature: {}".format(c, v)
-                                for c, v in missing_drops])))
-                raise ValueError(msg)
-            return np.array([np.where(cat_list == val)[0][0]
-                             for (val, cat_list) in
-                             zip(self.drop, self.categories_)], dtype=np.int_)
-        else:
-            msg = ("Wrong input for parameter `drop`. Expected "
-                   "'first', None or array of objects, got {}")
-            raise ValueError(msg.format(type(self.drop)))
+        return X_list
 
     def fit(self, X, y=None):
         """Fit OneHotEncoder to X.
 
         Parameters
         ----------
-        X : array-like, shape [n_samples, n_features]
-            The data to determine the categories of each feature.
+        X : array-like of shape (n_samples, n_features)
+            The data to determine the categories of each feature.
 
         Returns
         -------
         self
         """
-        self._validate_keywords()
-        self._fit(X, handle_unknown=self.handle_unknown)
-        self.drop_idx_ = self._compute_drop_idx()
+        self._fit(X)
         return self
 
-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
-
-        Equivalent to fit(X).transform(X) but more convenient.
-
-        Parameters
-        ----------
-        X : array-like, shape [n_samples, n_features]
-            The data to encode.
-
-        Returns
-        -------
-        X_out : sparse matrix if sparse=True else a 2-d array
-            Transformed input.
-        """
-        self._validate_keywords()
-        return super().fit_transform(X, y)
-
     def transform(self, X):
-        """Transform X using one-hot encoding.
+        """Transform X to encoding
 
         Parameters
         ----------
-        X : array-like, shape [n_samples, n_features]
+        X : array-like of shape (n_samples, n_features)
             The data to encode.
 
         Returns
         -------
-        X_out : sparse matrix if sparse=True else a 2-d array
-            Transformed input.
+        X_out : {ndarray, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            Transformed array. When `sparse=True`, `X_out` is a sparse array.
         """
         check_is_fitted(self)
-        # validation of X happens in _check_X called by _transform
-        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-
-        n_samples, n_features = X_int.shape
-
-        if self.drop is not None:
-            to_drop = self.drop_idx_.reshape(1, -1)
-
-            # We remove all the dropped categories from mask, and decrement all
-            # categories that occur after them to avoid an empty column.
+        X_list = self._check_X(X)
+        return self._transform_list(X_list)
 
-            keep_cells = X_int != to_drop
-            X_mask &= keep_cells
-            X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - 1 for cats in self.categories_]
-        else:
-            n_values = [len(cats) for cats in self.categories_]
-
-        mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
-        feature_indices = np.cumsum(n_values)
-        indices = (X_int + feature_indices[:-1]).ravel()[mask]
-        indptr = X_mask.sum(axis=1).cumsum()
-        indptr = np.insert(indptr, 0, 0)
-        data = np.ones(n_samples * n_features)[mask]
-
-        out = sparse.csr_matrix((data, indices, indptr),
-                                shape=(n_samples, feature_indices[-1]),
-                                dtype=self.dtype)
-        if not self.sparse:
-            return out.toarray()
-        else:
-            return out
-
-    def inverse_transform(self, X):
-        """Convert the back data to the original representation.
-
-        In case unknown categories are encountered (all zeros in the
-        one-hot encoding), ``None`` is used to represent this category.
+    def fit_transform(self, X, y=None):
+        """Fit encoder to X and transform X.
 
         Parameters
         ----------
-        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
-            The transformed data.
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
 
         Returns
         -------
-        X_tr : array-like, shape [n_samples, n_features]
-            Inverse transformed array.
-
+        X_out : {ndarray, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            Transformed array. When `sparse=True`, `X_out` is a sparse array.
         """
-        check_is_fitted(self)
-        X = check_array(X, accept_sparse='csr')
-
-        n_samples, _ = X.shape
-        n_features = len(self.categories_)
-        if self.drop is None:
-            n_transformed_features = sum(len(cats)
-                                         for cats in self.categories_)
-        else:
-            n_transformed_features = sum(len(cats) - 1
-                                         for cats in self.categories_)
-
-        # validate shape of passed X
-        msg = ("Shape of the passed X data is not correct. Expected {0} "
-               "columns, got {1}.")
-        if X.shape[1] != n_transformed_features:
-            raise ValueError(msg.format(n_transformed_features, X.shape[1]))
-
-        # create resulting array of appropriate dtype
-        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
-        X_tr = np.empty((n_samples, n_features), dtype=dt)
-
-        j = 0
-        found_unknown = {}
-
-        for i in range(n_features):
-            if self.drop is None:
-                cats = self.categories_[i]
-            else:
-                cats = np.delete(self.categories_[i], self.drop_idx_[i])
-            n_categories = len(cats)
-
-            # Only happens if there was a column with a unique
-            # category. In this case we just fill the column with this
-            # unique category value.
-            if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
-                j += n_categories
-                continue
-            sub = X[:, j:j + n_categories]
-            # for sparse X argmax returns 2D matrix, ensure 1D array
-            labels = np.asarray(_argmax(sub, axis=1)).flatten()
-            X_tr[:, i] = cats[labels]
-            if self.handle_unknown == 'ignore':
-                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
-                # ignored unknown categories: we have a row of all zero
-                if unknown.any():
-                    found_unknown[i] = unknown
-            # drop will either be None or handle_unknown will be error. If
-            # self.drop is not None, then we can safely assume that all of
-            # the nulls in each column are the dropped value
-            elif self.drop is not None:
-                dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
-                if dropped.any():
-                    X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
-
-            j += n_categories
-
-        # if ignored are found: potentially need to upcast result to
-        # insert None values
-        if found_unknown:
-            if X_tr.dtype != object:
-                X_tr = X_tr.astype(object)
-
-            for idx, mask in found_unknown.items():
-                X_tr[mask, idx] = None
-
-        return X_tr
+        X_list = self._fit(X)
+        return self._transform_list(X_list)
 
     def get_feature_names(self, input_features=None):
         """Return feature names for output features.
@@ -526,8 +415,203 @@ def get_feature_names(self, input_features=None):
 
         return np.array(feature_names, dtype=object)
 
+    def _validate_keywords(self):
+        if self.handle_unknown not in ('error', 'ignore'):
+            msg = ("handle_unknown should be either 'error' or 'ignore', "
+                   "got {0}.".format(self.handle_unknown))
+            raise ValueError(msg)
+        # If we have both dropped columns and ignored unknown
+        # values, there will be ambiguous cells. This creates difficulties
+        # in interpreting the model.
+        if self.drop is not None and self.handle_unknown != 'error':
+            raise ValueError(
+                "`handle_unknown` must be 'error' when the drop parameter is "
+                "specified, as both would create categories that are all "
+                "zero.")
 
-class OrdinalEncoder(_BaseEncoder):
+    def _check_drop(self, n_features):
+        if self.drop is None:
+            return [{'drop': None}] * n_features
+        if isinstance(self.drop, str) and self.drop == 'first':
+            return [{'drop_first': True}] * n_features
+        if not isinstance(self.drop, str):
+            try:
+                drops = np.asarray(self.drop, dtype=object)
+                drops_len = len(drops)
+            except (ValueError, TypeError):
+                msg = ("Wrong input for parameter `drop`. Expected "
+                       "'first', None or array of objects, got {}")
+                raise ValueError(msg.format(type(self.drop)))
+            if drops_len != n_features:
+                msg = ("`drop` should have length equal to the number "
+                       "of features ({}), got {}")
+                raise ValueError(msg.format(n_features,
+                                            len(self.drop)))
+            return [{'drop': drop} for drop in drops]
+        else:
+            msg = ("Wrong input for parameter `drop`. Expected "
+                   "'first', None or array of objects, got {}")
+            raise ValueError(msg.format(type(self.drop)))
+
+
+class _SingleOneHotEncoder(TransformerMixin, BaseEstimator):
+    """One hot encoder for a single categorical feature."""
+    def __init__(self, categories='auto', drop=None, drop_first=False,
+                 dtype=np.float64, handle_unknown='error',
+                 sparse=True, feature_idx=0):
+        self.categories = categories
+        self.dtype = dtype
+        self.handle_unknown = handle_unknown
+        self.drop = drop
+        self.drop_first = drop_first
+        self.sparse = sparse
+        self.feature_idx = feature_idx
+
+    def fit(self, X):
+        """Fit one hot encoder for a single categorical feature.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
+
+        Returns
+        -------
+        self
+        """
+        if isinstance(self.categories, str) and self.categories == 'auto':
+            cats = _encode(X)
+        else:  # categories were given
+            cats = np.array(self.categories, dtype=X.dtype)
+            if X.dtype != object and not np.all(np.sort(cats) == cats):
+                raise ValueError("Unsorted categories are not "
+                                 "supported for numerical categories")
+
+        if self.handle_unknown == 'error':
+            diff = _encode_check_unknown(X, cats)
+            if diff:
+                msg = ("Found unknown categories {0} in column {1} "
+                       "during fit".format(diff, self.feature_idx))
+                raise ValueError(msg)
+
+        self.categories_ = cats
+
+        # compute drop idx
+        if self.drop_first:
+            self.drop_idx_ = 0
+        elif self.drop is None:
+            self.drop_idx_ = None
+        elif self.drop not in self.categories_:
+            self.drop_idx_ = 'missing'
+        else:
+            self.drop_idx_ = np.where(self.categories_ == self.drop)[0][0]
+
+        if self.drop_idx_ is not None:
+            self.n_features_out_ = len(self.categories_) - 1
+        else:
+            self.n_features_out_ = len(self.categories_)
+        return self
+
+    def transform(self, X):
+        """Transform a single categorical feature.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            Encoded feature. If `sparse=True` a sparse matrix is returned.
+        """
+        diff, X_mask = _encode_check_unknown(X, self.categories_,
+                                             return_mask=True)
+        if not np.all(X_mask):
+            if self.handle_unknown == 'error':
+                msg = ("Found unknown categories {0} in column {1} "
+                       "during transform".format(diff, self.feature_idx))
+                raise ValueError(msg)
+
+            # cast Xi into the largest string type necessary
+            # to handle different lengths of numpy strings
+            if (self.categories_.dtype.kind in ('U', 'S')
+                    and self.categories_.itemsize > X.itemsize):
+                X = X.astype(self.categories_.dtype)
+            else:
+                X = X.copy()
+            X[~X_mask] = self.categories_[0]
+
+        _, X_encoded = _encode(X, self.categories_, encode=True,
+                               check_unknown=False)
+
+        if self.drop_idx_ is not None:
+            keep_cells = X_encoded != self.drop_idx_
+            X_mask &= keep_cells
+
+            # adjust encoding to remove the dropped column
+            X_encoded[X_encoded > self.drop_idx_] -= 1
+
+        n_samples = X.shape[0]
+        X_mask_non_zero = np.flatnonzero(X_mask)
+
+        out = sparse.lil_matrix((n_samples, self.n_features_out_),
+                                dtype=self.dtype)
+        out[X_mask_non_zero, X_encoded[X_mask]] = 1
+
+        if self.sparse:
+            return out
+        else:
+            return out.toarray()
+
+    def inverse_transform(self, X):
+        """Inverse transform to a single categorical feature.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples,)
+            The transformed data of a single feature.
+
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, 1)
+            Inverse transform.
+        """
+        # Only happens if there was a column with a unique
+        # category. In this case we just fill the column with this
+        # unique category value.
+        if self.n_features_out_ == 0:
+            value = self.categories_[self.drop_idx_]
+            return np.full((X.shape[0], 1), value)
+
+        if self.drop_idx_ is None:
+            cats = self.categories_
+        else:
+            cats = np.delete(self.categories_, self.drop_idx_)
+
+        # for sparse X argmax returns 2D matrix, ensure 1D array
+        labels = np.asarray(_argmax(X, axis=1)).flatten()
+        X_tr = cats[labels]
+        if self.handle_unknown == 'ignore':
+            unknown = np.asarray(X.sum(axis=1) == 0).flatten()
+            # ignored unknown categories: we have a row of all zero
+            if unknown.any():
+                if X_tr.dtype != object:
+                    X_tr = X_tr.astype(object)
+                X_tr[unknown] = None
+        # drop will either be None or handle_unknown will be error. If
+        # self.drop is not None, then we can safely assume that all of
+        # the nulls in each column are the dropped value
+        elif self.drop_idx_ is not None:
+            dropped = np.asarray(X.sum(axis=1) == 0).flatten()
+            if dropped.any():
+                X_tr[dropped] = self.categories_[self.drop_idx_]
+
+        return X_tr[:, None]
+
+
+class OrdinalEncoder(_EncoderUnion):
     """Encode categorical features as an integer array.
 
     The input to this transformer should be an array-like of integers or
@@ -586,77 +670,154 @@ class OrdinalEncoder(_BaseEncoder):
     sklearn.preprocessing.LabelEncoder : encodes target labels with values
         between 0 and n_classes-1.
     """
-
     def __init__(self, categories='auto', dtype=np.float64):
         self.categories = categories
         self.dtype = dtype
 
+    def _fit(self, X):
+        """Validate keywords and fit `X` and return `X_list`."""
+        X_list = self._check_X(X)
+        categories = self._check_categories(len(X_list))
+
+        encoders = [_SingleOrdinalEncoder(categories=cat,
+                                          dtype=self.dtype,
+                                          feature_idx=idx)
+                    for idx, cat in enumerate(categories)]
+
+        self._fit_list(X_list, encoders)
+        return X_list
+
     def fit(self, X, y=None):
-        """Fit the OrdinalEncoder to X.
+        """Fit OrdinalEncoder to X.
 
         Parameters
         ----------
-        X : array-like, shape [n_samples, n_features]
-            The data to determine the categories of each feature.
+        X : array-like of shape (n_samples, n_features)
+            The data to determine the categories of each feature.
 
         Returns
         -------
         self
-
         """
         self._fit(X)
-
         return self
 
     def transform(self, X):
-        """Transform X to ordinal codes.
+        """Transform X to encoding
 
         Parameters
         ----------
-        X : array-like, shape [n_samples, n_features]
+        X : array-like of shape (n_samples, n_features)
             The data to encode.
 
         Returns
         -------
-        X_out : sparse matrix or a 2-d array
-            Transformed input.
+        X_out : ndarray of shape (n_samples, n_encoded_features)
+            Transformed array.
+        """
+        check_is_fitted(self)
+        X_list = self._check_X(X)
+        return self._transform_list(X_list)
 
+    def fit_transform(self, X, y=None):
+        """Fit encoder to X and transform X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
+
+        Returns
+        -------
+        X_out : ndarray of shape (n_samples, n_encoded_features)
+            Transformed array.
         """
-        X_int, _ = self._transform(X)
-        return X_int.astype(self.dtype, copy=False)
+        X_list = self._fit(X)
+        return self._transform_list(X_list)
 
-    def inverse_transform(self, X):
-        """Convert the data back to the original representation.
+
+class _SingleOrdinalEncoder(TransformerMixin, BaseEstimator):
+    """Ordinal Encoder for a single categorical feature."""
+    def __init__(self, categories='auto', dtype=np.float64,
+                 handle_unknown='error', feature_idx=0):
+        self.categories = categories
+        self.dtype = dtype
+        self.handle_unknown = handle_unknown
+        self.feature_idx = feature_idx
+
+    def fit(self, X, y=None):
+        """Fit ordinal encoder on a single categorical feature.
 
         Parameters
         ----------
-        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
-            The transformed data.
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
 
         Returns
         -------
-        X_tr : array-like, shape [n_samples, n_features]
-            Inverse transformed array.
+        self
+        """
+        if isinstance(self.categories, str) and self.categories == 'auto':
+            cats = _encode(X)
+        else:  # categories were given
+            cats = np.array(self.categories, dtype=X.dtype)
+            if X.dtype != object and not np.all(np.sort(cats) == cats):
+                raise ValueError("Unsorted categories are not "
+                                 "supported for numerical categories")
+        diff = _encode_check_unknown(X, cats)
+        if diff:
+            msg = ("Found unknown categories {0} in column {1} "
+                   "during fit".format(diff, self.feature_idx))
+            raise ValueError(msg)
+
+        self.categories_ = cats
+        self.n_features_out_ = 1
+        return self
+
+    def transform(self, X):
+        """Transform a single categorical feature.
 
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
+
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, 1)
+            Encoded categorical feature.
         """
-        check_is_fitted(self)
-        X = check_array(X, accept_sparse='csr')
+        diff, valid_mask = _encode_check_unknown(X, self.categories_,
+                                                 return_mask=True)
+        if not np.all(valid_mask):
+            if self.handle_unknown == 'error':
+                msg = ("Found unknown categories {0} in column {1} "
+                       "during transform".format(diff, self.feature_idx))
+                raise ValueError(msg)
 
-        n_samples, _ = X.shape
-        n_features = len(self.categories_)
+            # cast Xi into the largest string type necessary
+            # to handle different lengths of numpy strings
+            if (self.categories_.dtype.kind in ('U', 'S')
+                    and self.categories_.itemsize > X.itemsize):
+                X = X.astype(self.categories_.dtype)
+            else:
+                X = X.copy()
+            X[~valid_mask] = self.categories_[0]
 
-        # validate shape of passed X
-        msg = ("Shape of the passed X data is not correct. Expected {0} "
-               "columns, got {1}.")
-        if X.shape[1] != n_features:
-            raise ValueError(msg.format(n_features, X.shape[1]))
+        _, encoded = _encode(X, self.categories_, encode=True,
+                             check_unknown=False)
+        return encoded[:, None].astype(self.dtype)
 
-        # create resulting array of appropriate dtype
-        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
-        X_tr = np.empty((n_samples, n_features), dtype=dt)
+    def inverse_transform(self, X):
+        """Convert the data back into the original representation.
 
-        for i in range(n_features):
-            labels = X[:, i].astype('int64', copy=False)
-            X_tr[:, i] = self.categories_[i][labels]
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Transformed data.
 
-        return X_tr
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, 1)
+            Inverse transformed array.
+        """
+        labels = X.astype('int64', copy=False)
+        return self.categories_[labels]
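
Before the follow-up patches, it is worth noting the contract PATCH 1 establishes: `_EncoderUnion` only assumes each per-feature encoder exposes `fit`, `transform`, `inverse_transform` and an `n_features_out_` attribute. The sketch below is illustrative only — it is not part of the patch series, and `_MinimalSingleEncoder` is a hypothetical name — but it shows a minimal encoder satisfying that interface:

    import numpy as np

    class _MinimalSingleEncoder:
        """Hypothetical single-feature encoder obeying the union contract."""

        def fit(self, X):
            # X is a 1-D column; record the sorted categories seen in fit
            self.categories_ = np.unique(X)
            # this encoder emits one output column per input feature
            self.n_features_out_ = 1
            return self

        def transform(self, X):
            # encode each value as its index into categories_
            return np.searchsorted(self.categories_, X)[:, None]

        def inverse_transform(self, X):
            # map integer codes back to the original category values
            return self.categories_[X.astype(np.int64).ravel()][:, None]

    column = np.array(["a", "b", "a", "c"], dtype=object)
    enc = _MinimalSingleEncoder().fit(column)
    roundtrip = enc.inverse_transform(enc.transform(column))
    assert (roundtrip.ravel() == column).all()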
From 7cb752c469ebec402dc29984b36a8a2066999e7f Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 12 Sep 2019 19:54:24 -0400
Subject: [PATCH 2/9] DOC Add note regarding onehotencoder

---
 sklearn/preprocessing/_encoders.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index d4609553f11ad..cd0bcba49ffb2 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -455,7 +455,12 @@ def _check_drop(self, n_features):
 
 
 class _SingleOneHotEncoder(TransformerMixin, BaseEstimator):
-    """One hot encoder for a single categorical feature."""
+    """One hot encoder for a single categorical feature.
+
+    When calling `fit`, the attribute `drop_idx_` will be set to 'missing'
+    when the drop category is not found in the dataset or given by
+    categories. `drop_idx_` should be checked by the caller to make sure
+    the encoder is valid."""
     def __init__(self, categories='auto', drop=None, drop_first=False,
                  dtype=np.float64, handle_unknown='error',
                  sparse=True, feature_idx=0):
@@ -502,6 +507,8 @@ def fit(self, X):
         elif self.drop is None:
             self.drop_idx_ = None
         elif self.drop not in self.categories_:
+            # This is an error state. Caller should check this value and
+            # handle according.
             self.drop_idx_ = 'missing'
         else:
             self.drop_idx_ = np.where(self.categories_ == self.drop)[0][0]

From 7a2f01dc220418ff62a40207b7a0a673af43bc80 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 12 Sep 2019 20:20:52 -0400
Subject: [PATCH 3/9] CLN Copy only when needed

---
 sklearn/preprocessing/_encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index cd0bcba49ffb2..c8b1b99754114 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -811,7 +811,7 @@ def transform(self, X):
 
         _, encoded = _encode(X, self.categories_, encode=True,
                              check_unknown=False)
-        return encoded[:, None].astype(self.dtype)
+        return encoded[:, None].astype(self.dtype, copy=False)

From c5d3351b3898b0c094279ace7a8aebc6ea434b64 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 12 Sep 2019 22:26:59 -0400
Subject: [PATCH 4/9] BUG Fix linux py35 bug

---
 sklearn/preprocessing/_encoders.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c8b1b99754114..bd394ab171fa1 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -119,7 +119,8 @@ def _transform_list(self, X_list):
 
         X_trs = []
         for encoder, X_col in zip(self._encoders, X_list):
-            X_trs.append(encoder.transform(X_col))
+            if encoder.n_features_out_ != 0:
+                X_trs.append(encoder.transform(X_col))
         return self._hstack(X_trs)

From ebc4c741c1f582199902d6b1003131a73624cade Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 12 Sep 2019 22:32:28 -0400
Subject: [PATCH 5/9] BUG Fix Windows 32 bug

---
 sklearn/preprocessing/_encoders.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index bd394ab171fa1..e2185af3c4414 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -293,7 +293,8 @@ def __init__(self, categories='auto', drop=None, sparse=True,
 
     @property
     def drop_idx_(self):
-        return np.array([encoder.drop_idx_ for encoder in self._encoders])
+        return np.array([encoder.drop_idx_ for encoder in self._encoders],
+                        dtype=np.int_)
 
     def _fit(self, X):
         """Validate keywords and fit `X` and return `X_list`."""
@@ -407,11 +408,11 @@ def get_feature_names(self, input_features=None):
                                  len(input_features)))
 
         feature_names = []
-        for i in range(len(cats)):
-            names = [
-                input_features[i] + '_' + str(t) for t in cats[i]]
+        for input_feat, cat, encoder in zip(input_features, cats,
+                                            self._encoders):
+            names = [input_feat + '_' + str(t) for t in cat]
             if self.drop is not None:
-                names.pop(self.drop_idx_[i])
+                names.pop(encoder.drop_idx_)
             feature_names.extend(names)
 
         return np.array(feature_names, dtype=object)
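
The `drop_idx_` and `get_feature_names` changes above are easiest to sanity-check through the public API, which this series leaves unchanged. A usage sketch (the expected outputs in the comments assume scikit-learn's documented `drop='first'` behavior):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([["a", "x"], ["b", "y"], ["a", "y"]], dtype=object)
    enc = OneHotEncoder(drop="first").fit(X)

    print(enc.drop_idx_)                        # [0 0]: 'a' and 'x' dropped
    print(enc.get_feature_names(["f0", "f1"]))  # ['f0_b' 'f1_y']
    print(enc.transform(X).toarray())
    # [[0. 0.]
    #  [1. 1.]
    #  [0. 1.]]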
From dcf25043a4ef791abf2ab35d196f550fdfce62a0 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Fri, 13 Sep 2019 11:07:15 -0400
Subject: [PATCH 6/9] CLN Address @glemaitre comments

---
 sklearn/preprocessing/_encoders.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index e2185af3c4414..3252f70252fbc 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -61,10 +61,9 @@ def _check_X(self, X):
             # to keep the dtype information to be used in the encoder.
             needs_validation = True
 
-        n_features = X.shape[1]
         X_columns = []
 
-        for i in range(n_features):
+        for i in range(X.shape[1]):
             Xi = safe_indexing(X, i, axis=1)
             Xi = check_array(Xi, ensure_2d=False, dtype=None,
                              force_all_finite=needs_validation)
@@ -90,12 +89,10 @@ def _check_categories(self, n_features):
 
     def _fit_list(self, X_list, encoders):
         """Fit encoders on X_list"""
-        assert len(X_list) == len(encoders)
-
         for X_col, encoder in zip(X_list, encoders):
             encoder.fit(X_col)
 
-        # map from X_trans indicies to indicies from the original X
+        # map from X_trans indices to indices from the original X
         X_trans_to_orig_idx = []
         X_trans_idx = 0
         for encoder in encoders:
@@ -115,7 +112,8 @@ def _transform_list(self, X_list):
                 "The number of features in X is different to the number of "
                 "features of the fitted data. The fitted data had {} features "
                 "and the X has {} features."
-                .format(len(self.categories_,), n_features))
+                .format(len(self.categories_,), n_features)
+            )
 
         X_trs = []
         for encoder, X_col in zip(self._encoders, X_list):
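
The bookkeeping that `_fit_list` performs (and that the next patch renames for clarity) reduces to computing cumulative column spans, one per fitted encoder, which `inverse_transform` later uses to slice the stacked output back apart. A standalone sketch, with `column_spans` as a hypothetical helper name:

    def column_spans(n_features_out_per_encoder):
        # cumulative (begin, end) slices into the stacked transformed matrix
        spans, start = [], 0
        for n_out in n_features_out_per_encoder:
            spans.append((start, start + n_out))
            start += n_out
        return spans

    # e.g. three encoders emitting 3, 1 and 4 columns respectively:
    assert column_spans([3, 1, 4]) == [(0, 3), (3, 4), (4, 8)]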
From 61903fd8686bf924bf04b3a972d43dc261c3ff80 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 19 Sep 2019 15:05:44 -0400
Subject: [PATCH 7/9] BUG Improves performance

---
 sklearn/preprocessing/_encoders.py | 95 +++++++++++++++---------------
 1 file changed, 49 insertions(+), 46 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 3252f70252fbc..daa12a9d20f79 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -22,8 +22,8 @@
 
 
 class _EncoderUnion(TransformerMixin, BaseEstimator):
-    """Base class for encoders that includes the code to categorize and
-    transform the input features.
+    """Base class for encoders that includes the code to encode and
+    transform the input features one by one.
 
     The encoders passed to `_fit_list` must define:
 
@@ -34,7 +34,7 @@ class _EncoderUnion(TransformerMixin, BaseEstimator):
 
     @property
     def categories_(self):
-        return [encoder.categories_ for encoder in self._encoders]
+        return [encoder.categories_ for encoder in self._single_encoders]
 
     def _check_X(self, X):
         """Perform custom check_array:
@@ -87,22 +87,23 @@ def _check_categories(self, n_features):
 
         return categories
 
-    def _fit_list(self, X_list, encoders):
-        """Fit encoders on X_list"""
-        for X_col, encoder in zip(X_list, encoders):
+    def _fit_list(self, X_list, single_encoders):
+        """Fit single_encoders on X_list"""
+        for X_col, encoder in zip(X_list, single_encoders):
             encoder.fit(X_col)
 
-        # map from X_trans indices to indices from the original X
-        X_trans_to_orig_idx = []
+        # maps indices from original X to indices in the transformed X
+        # this is used in inverse_transform
+        orig_idx_to_X_trans_idx = []
         X_trans_idx = 0
-        for encoder in encoders:
-            n_feat_out = encoder.n_features_out_
-            begin, end = X_trans_idx, X_trans_idx + n_feat_out
-            X_trans_to_orig_idx.append((begin, end))
-            X_trans_idx += n_feat_out
+        for encoder in single_encoders:
+            n_features_out = encoder.n_features_out_
+            begin, end = X_trans_idx, X_trans_idx + n_features_out
+            orig_idx_to_X_trans_idx.append((begin, end))
+            X_trans_idx += n_features_out
 
-        self._X_trans_to_orig_idx = X_trans_to_orig_idx
-        self._encoders = encoders
+        self._orig_idx_to_X_trans_idx = orig_idx_to_X_trans_idx
+        self._single_encoders = single_encoders
 
     def _transform_list(self, X_list):
         """Transform X_list with encoders"""
@@ -115,18 +116,18 @@ def _transform_list(self, X_list):
                 .format(len(self.categories_,), n_features)
             )
 
-        X_trs = []
-        for encoder, X_col in zip(self._encoders, X_list):
+        X_trans = []
+        for encoder, X_col in zip(self._single_encoders, X_list):
             if encoder.n_features_out_ != 0:
-                X_trs.append(encoder.transform(X_col))
-        return self._hstack(X_trs)
+                X_trans.append(encoder.transform(X_col))
+        return self._hstack(X_trans)
 
-    def _hstack(self, Xs):
-        if any(sparse.issparse(X_tran) for X_tran in Xs):
-            Xs = sparse.hstack(Xs).tocsr()
+    def _hstack(self, X_trans):
+        if any(sparse.issparse(X_tran) for X_tran in X_trans):
+            X_trans_stacked = sparse.hstack(X_trans).tocsr()
         else:
-            Xs = np.hstack(Xs)
-        return Xs
+            X_trans_stacked = np.hstack(X_trans)
+        return X_trans_stacked
 
     def inverse_transform(self, X):
         """Convert the data back into the original representation.
@@ -145,7 +146,8 @@ def inverse_transform(self, X):
         check_is_fitted(self)
         X = check_array(X, accept_sparse='csr')
 
-        n_features = sum(encoder.n_features_out_ for encoder in self._encoders)
+        n_features = sum(encoder.n_features_out_ for encoder in
+                         self._single_encoders)
 
         # validate shape of passed X
         msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
         if X.shape[1] != n_features:
             raise ValueError(msg.format(n_features, X.shape[1]))
 
         X_trs = []
-        for encoder, (begin, end) in zip(self._encoders,
-                                         self._X_trans_to_orig_idx):
+        for encoder, (begin, end) in zip(self._single_encoders,
+                                         self._orig_idx_to_X_trans_idx):
             X_slice = safe_indexing(X, slice(begin, end), axis=1)
             X_trs.append(encoder.inverse_transform(X_slice))
         return self._hstack(X_trs)
@@ -291,8 +293,8 @@ def __init__(self, categories='auto', drop=None, sparse=True,
 
     @property
     def drop_idx_(self):
-        return np.array([encoder.drop_idx_ for encoder in self._encoders],
-                        dtype=np.int_)
+        return np.array([encoder.drop_idx_ for encoder in
+                         self._single_encoders], dtype=np.int_)
 
     def _fit(self, X):
         """Validate keywords and fit `X` and return `X_list`."""
@@ -312,14 +314,14 @@ def _fit(self, X):
                                  **drop_kwarg)
             for idx, cat, drop_kwarg in zip(count(), categories, drop_kwargs)]
 
-        self._fit_list(X_list, encoders)
+        super()._fit_list(X_list, encoders)
 
         # validate encoders
         missing_drops = []
         for idx, encoder in enumerate(encoders):
             drop_idx = encoder.drop_idx_
             if isinstance(drop_idx, str) and drop_idx == 'missing':
-                missing_drops.append((idx, encoder.drop))
+                missing_drops.append((idx, encoder.drop_category))
 
         if any(missing_drops):
             msg = ("The following categories were supposed to be "
@@ -362,7 +364,7 @@ def transform(self, X):
         """
         check_is_fitted(self)
         X_list = self._check_X(X)
-        return self._transform_list(X_list)
+        return super()._transform_list(X_list)
 
     def fit_transform(self, X, y=None):
         """Fit encoder to X and transform X.
@@ -379,7 +381,7 @@ def fit_transform(self, X, y=None):
             Transformed array. When `sparse=True`, `X_out` is a sparse array.
         """
         X_list = self._fit(X)
-        return self._transform_list(X_list)
+        return super()._transform_list(X_list)
 
     def get_feature_names(self, input_features=None):
         """Return feature names for output features.
@@ -407,7 +409,7 @@ def get_feature_names(self, input_features=None):
 
         feature_names = []
         for input_feat, cat, encoder in zip(input_features, cats,
-                                            self._encoders):
+                                            self._single_encoders):
             names = [input_feat + '_' + str(t) for t in cat]
             if self.drop is not None:
                 names.pop(encoder.drop_idx_)
@@ -431,7 +433,7 @@ def _validate_keywords(self):
 
     def _check_drop(self, n_features):
         if self.drop is None:
-            return [{'drop': None}] * n_features
+            return [{'drop_category': None}] * n_features
         if isinstance(self.drop, str) and self.drop == 'first':
             return [{'drop_first': True}] * n_features
         if not isinstance(self.drop, str):
@@ -447,7 +449,7 @@ def _check_drop(self, n_features):
                 "of features ({}), got {}")
                 raise ValueError(msg.format(n_features,
                                             len(self.drop)))
-            return [{'drop': drop} for drop in drops]
+            return [{'drop_category': drop} for drop in drops]
         else:
             msg = ("Wrong input for parameter `drop`. Expected "
                    "'first', None or array of objects, got {}")
@@ -461,13 +463,13 @@ class _SingleOneHotEncoder(TransformerMixin, BaseEstimator):
     when the drop category is not found in the dataset or given by
     categories. `drop_idx_` should be checked by the caller to make sure
     the encoder is valid."""
-    def __init__(self, categories='auto', drop=None, drop_first=False,
+    def __init__(self, categories='auto', drop_category=None, drop_first=False,
                  dtype=np.float64, handle_unknown='error',
                  sparse=True, feature_idx=0):
         self.categories = categories
         self.dtype = dtype
         self.handle_unknown = handle_unknown
-        self.drop = drop
+        self.drop_category = drop_category
         self.drop_first = drop_first
         self.sparse = sparse
         self.feature_idx = feature_idx
@@ -504,14 +506,15 @@ def fit(self, X):
         # compute drop idx
         if self.drop_first:
             self.drop_idx_ = 0
-        elif self.drop is None:
+        elif self.drop_category is None:
             self.drop_idx_ = None
-        elif self.drop not in self.categories_:
+        elif self.drop_category not in self.categories_:
             # This is an error state. Caller should check this value and
-            # handle according.
+            # handle accordingly.
             self.drop_idx_ = 'missing'
         else:
-            self.drop_idx_ = np.where(self.categories_ == self.drop)[0][0]
+            self.drop_idx_ = np.where(self.categories_ ==
+                                      self.drop_category)[0][0]
 
         if self.drop_idx_ is not None:
             self.n_features_out_ = len(self.categories_) - 1
@@ -563,7 +566,7 @@ def transform(self, X):
         n_samples = X.shape[0]
         X_mask_non_zero = np.flatnonzero(X_mask)
 
-        out = sparse.lil_matrix((n_samples, self.n_features_out_),
+        out = sparse.csr_matrix((n_samples, self.n_features_out_),
                                 dtype=self.dtype)
         out[X_mask_non_zero, X_encoded[X_mask]] = 1
 
@@ -691,7 +694,7 @@ def _fit(self, X):
                                           feature_idx=idx)
                     for idx, cat in enumerate(categories)]
 
-        self._fit_list(X_list, encoders)
+        super()._fit_list(X_list, encoders)
         return X_list
 
     def fit(self, X, y=None):
@@ -724,7 +727,7 @@ def transform(self, X):
         """
         check_is_fitted(self)
         X_list = self._check_X(X)
-        return self._transform_list(X_list)
+        return super()._transform_list(X_list)
 
     def fit_transform(self, X, y=None):
         """Fit encoder to X and transform X.
@@ -740,7 +743,7 @@ def fit_transform(self, X, y=None):
             Transformed array.
         """
         X_list = self._fit(X)
-        return self._transform_list(X_list)
+        return super()._transform_list(X_list)
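
Assigning into an already-instantiated CSR matrix, as the hunk above does, still changes the sparsity structure and triggers SciPy's SparseEfficiencyWarning; the next patch sidesteps this by building the matrix directly from coordinate triplets. A standalone sketch of that construction (the sample values here are made up for illustration):

    import numpy as np
    from scipy import sparse

    X_encoded = np.array([2, 0, 1, 0])            # integer code per sample
    X_mask = np.array([True, True, False, True])  # False -> all-zero row
    n_features_out = 3

    row = np.flatnonzero(X_mask)
    col = X_encoded[X_mask]
    data = np.ones(X_mask.sum(), dtype=np.float64)
    out = sparse.csr_matrix((data, (row, col)),
                            shape=(X_mask.shape[0], n_features_out))
    print(out.toarray())
    # [[0. 0. 1.]
    #  [1. 0. 0.]
    #  [0. 0. 0.]
    #  [1. 0. 0.]]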
From 93c594bc69ee97916b67977f82438679ab79de28 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Thu, 19 Sep 2019 15:22:56 -0400
Subject: [PATCH 8/9] ENH Matches performance with master

---
 sklearn/preprocessing/_encoders.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index daa12a9d20f79..5778fd3f1b84c 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -564,12 +564,11 @@ def transform(self, X):
             X_encoded[X_encoded > self.drop_idx_] -= 1
 
         n_samples = X.shape[0]
-        X_mask_non_zero = np.flatnonzero(X_mask)
-
-        out = sparse.csr_matrix((n_samples, self.n_features_out_),
-                                dtype=self.dtype)
-        out[X_mask_non_zero, X_encoded[X_mask]] = 1
-
+        row = np.arange(0, n_samples)[X_mask]
+        col = X_encoded[X_mask]
+        data = np.ones(np.sum(X_mask), dtype=self.dtype)
+        out = sparse.csr_matrix((data, (row, col)),
+                                shape=(n_samples, self.n_features_out_))
         if self.sparse:
             return out
         else:

From 56b4db68b04a0e9dab9b886ce5eded1b311950a7 Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Wed, 25 Sep 2019 10:35:19 -0400
Subject: [PATCH 9/9] ENH Minor optimizations

---
 sklearn/preprocessing/_encoders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 5778fd3f1b84c..17917486c3ad4 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -564,10 +564,10 @@ def transform(self, X):
             X_encoded[X_encoded > self.drop_idx_] -= 1
 
         n_samples = X.shape[0]
-        row = np.arange(0, n_samples)[X_mask]
+        row = np.flatnonzero(X_mask)
         col = X_encoded[X_mask]
         data = np.ones(np.sum(X_mask), dtype=self.dtype)
-        out = sparse.csr_matrix((data, (row, col)),
+        out = sparse.csc_matrix((data, (row, col)),
                                 shape=(n_samples, self.n_features_out_))
         if self.sparse:
             return out