diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 0ee5d32720e63..17917486c3ad4 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -4,9 +4,11 @@
 import numpy as np
 from scipy import sparse
+from itertools import count
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
+from ..utils import safe_indexing
 from ..utils.fixes import _argmax
 from ..utils.validation import check_is_fitted
 
@@ -19,16 +21,23 @@
 ]
 
 
-class _BaseEncoder(TransformerMixin, BaseEstimator):
-    """
-    Base class for encoders that includes the code to categorize and
-    transform the input features.
+class _EncoderUnion(TransformerMixin, BaseEstimator):
+    """Base class for encoders that includes the code to encode and
+    transform the input features one by one.
+
+    The encoders passed to `_fit_list` must define:
+    1. `fit(X)` where `X` is an ndarray of shape (n_samples,)
+    2. `transform(X)` where `X` is an ndarray of shape (n_samples,)
+    3. `inverse_transform(X)` where `X` is an encoded ndarray or sparse array
     """
 
+    @property
+    def categories_(self):
+        return [encoder.categories_ for encoder in self._single_encoders]
+
     def _check_X(self, X):
-        """
-        Perform custom check_array:
+        """Perform custom check_array:
         - convert list of strings to object dtype
         - check for missing values for object dtype data (check_array does
           not do that)
@@ -52,58 +61,53 @@ def _check_X(self, X):
         # to keep the dtype information to be used in the encoder.
         needs_validation = True
 
-        n_samples, n_features = X.shape
         X_columns = []
 
-        for i in range(n_features):
-            Xi = self._get_feature(X, feature_idx=i)
+        for i in range(X.shape[1]):
+            Xi = safe_indexing(X, i, axis=1)
             Xi = check_array(Xi, ensure_2d=False, dtype=None,
                              force_all_finite=needs_validation)
             X_columns.append(Xi)
 
-        return X_columns, n_samples, n_features
+        return X_columns
 
-    def _get_feature(self, X, feature_idx):
-        if hasattr(X, 'iloc'):
-            # pandas dataframes
-            return X.iloc[:, feature_idx]
-        # numpy arrays, sparse arrays
-        return X[:, feature_idx]
+    def _check_categories(self, n_features):
+        """Check that categories are consistent with n_features."""
 
-    def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
-
-        if self.categories != 'auto':
-            if len(self.categories) != n_features:
-                raise ValueError("Shape mismatch: if categories is an array,"
-                                 " it has to be of shape (n_features,).")
-
-        self.categories_ = []
-
-        for i in range(n_features):
-            Xi = X_list[i]
-            if self.categories == 'auto':
-                cats = _encode(Xi)
-            else:
-                cats = np.array(self.categories[i], dtype=Xi.dtype)
-                if Xi.dtype != object:
-                    if not np.all(np.sort(cats) == cats):
-                        raise ValueError("Unsorted categories are not "
-                                         "supported for numerical categories")
-            if handle_unknown == 'error':
-                diff = _encode_check_unknown(Xi, cats)
-                if diff:
-                    msg = ("Found unknown categories {0} in column {1}"
-                           " during fit".format(diff, i))
-                    raise ValueError(msg)
-            self.categories_.append(cats)
-
-    def _transform(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
-
-        X_int = np.zeros((n_samples, n_features), dtype=np.int)
-        X_mask = np.ones((n_samples, n_features), dtype=np.bool)
+        if (self.categories != 'auto' and
+                len(self.categories) != n_features):
+            raise ValueError("Shape mismatch: if categories is an array, "
+                             "it has to be of shape (n_features,).")
+        if isinstance(self.categories, str) and self.categories == 'auto':
+            categories = ['auto'] * n_features
+        else:
+            categories = self.categories
+
+        return categories
+
+    def _fit_list(self, X_list, single_encoders):
+        """Fit single_encoders on X_list."""
+        for X_col, encoder in zip(X_list, single_encoders):
+            encoder.fit(X_col)
+
+        # maps indices from the original X to indices in the transformed X;
+        # this is used in inverse_transform
+        orig_idx_to_X_trans_idx = []
+        X_trans_idx = 0
+        for encoder in single_encoders:
+            n_features_out = encoder.n_features_out_
+            begin, end = X_trans_idx, X_trans_idx + n_features_out
+            orig_idx_to_X_trans_idx.append((begin, end))
+            X_trans_idx += n_features_out
+
+        self._orig_idx_to_X_trans_idx = orig_idx_to_X_trans_idx
+        self._single_encoders = single_encoders
+
+    def _transform_list(self, X_list):
+        """Transform X_list with the fitted encoders."""
+        n_features = len(X_list)
         if n_features != len(self.categories_):
             raise ValueError(
                 "The number of features in X is different to the number of "
@@ -112,43 +116,57 @@ def _transform(self, X, handle_unknown='error'):
                 .format(len(self.categories_,), n_features)
             )
 
-        for i in range(n_features):
-            Xi = X_list[i]
-            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
-                                                     return_mask=True)
+        X_trans = []
+        for encoder, X_col in zip(self._single_encoders, X_list):
+            if encoder.n_features_out_ != 0:
+                X_trans.append(encoder.transform(X_col))
+        return self._hstack(X_trans)
 
-            if not np.all(valid_mask):
-                if handle_unknown == 'error':
-                    msg = ("Found unknown categories {0} in column {1}"
-                           " during transform".format(diff, i))
-                    raise ValueError(msg)
-                else:
-                    # Set the problematic rows to an acceptable value and
-                    # continue `The rows are marked `X_mask` and will be
-                    # removed later.
-                    X_mask[:, i] = valid_mask
-                    # cast Xi into the largest string type necessary
-                    # to handle different lengths of numpy strings
-                    if (self.categories_[i].dtype.kind in ('U', 'S')
-                            and self.categories_[i].itemsize > Xi.itemsize):
-                        Xi = Xi.astype(self.categories_[i].dtype)
-                    else:
-                        Xi = Xi.copy()
-
-                    Xi[~valid_mask] = self.categories_[i][0]
-            # We use check_unknown=False, since _encode_check_unknown was
-            # already called above.
-            _, encoded = _encode(Xi, self.categories_[i], encode=True,
-                                 check_unknown=False)
-            X_int[:, i] = encoded
-
-        return X_int, X_mask
+    def _hstack(self, X_trans):
+        if any(sparse.issparse(X_tran) for X_tran in X_trans):
+            X_trans_stacked = sparse.hstack(X_trans).tocsr()
+        else:
+            X_trans_stacked = np.hstack(X_trans)
+        return X_trans_stacked
+
+    def inverse_transform(self, X):
+        """Convert the data back into the original representation.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            The transformed data.
+
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, n_features)
+            Inverse transformed array.
+        """
+        check_is_fitted(self)
+        X = check_array(X, accept_sparse='csr')
+
+        n_features = sum(encoder.n_features_out_ for encoder in
+                         self._single_encoders)
+
+        # validate shape of passed X
+        msg = ("Shape of the passed X data is not correct. Expected {0} "
+               "columns, got {1}.")
+        if X.shape[1] != n_features:
+            raise ValueError(msg.format(n_features, X.shape[1]))
+
+        X_trs = []
+        for encoder, (begin, end) in zip(self._single_encoders,
+                                         self._orig_idx_to_X_trans_idx):
+            X_slice = safe_indexing(X, slice(begin, end), axis=1)
+            X_trs.append(encoder.inverse_transform(X_slice))
+        return self._hstack(X_trs)
 
     def _more_tags(self):
         return {'X_types': ['categorical']}
 
 
-class OneHotEncoder(_BaseEncoder):
+class OneHotEncoder(_EncoderUnion):
     """Encode categorical features as a one-hot numeric array.
 
     The input to this transformer should be an array-like of integers or
@@ -264,233 +282,106 @@ class OneHotEncoder(_BaseEncoder):
       iterables and a multilabel format, e.g. a (samples x classes) binary
       matrix indicating the presence of a class label.
     """
-
     def __init__(self, categories='auto', drop=None, sparse=True,
                  dtype=np.float64, handle_unknown='error'):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.drop = drop
 
-    def _validate_keywords(self):
-        if self.handle_unknown not in ('error', 'ignore'):
-            msg = ("handle_unknown should be either 'error' or 'ignore', "
-                   "got {0}.".format(self.handle_unknown))
+    @property
+    def drop_idx_(self):
+        return np.array([encoder.drop_idx_ for encoder in
+                         self._single_encoders], dtype=np.int_)
+
+    def _fit(self, X):
+        """Validate keywords, fit `X`, and return `X_list`."""
+        self._validate_keywords()
+        X_list = self._check_X(X)
+        n_features = len(X_list)
+
+        categories = self._check_categories(n_features)
+        drop_kwargs = self._check_drop(n_features)
+
+        encoders = [
+            _SingleOneHotEncoder(categories=cat,
+                                 dtype=self.dtype,
+                                 handle_unknown=self.handle_unknown,
+                                 sparse=self.sparse,
+                                 feature_idx=idx,
+                                 **drop_kwarg)
+            for idx, cat, drop_kwarg in zip(count(), categories, drop_kwargs)]
+
+        super()._fit_list(X_list, encoders)
+
+        # validate encoders: a 'missing' drop_idx_ means the category that
+        # was supposed to be dropped never appeared during fit
+        missing_drops = []
+        for idx, encoder in enumerate(encoders):
+            drop_idx = encoder.drop_idx_
+            if isinstance(drop_idx, str) and drop_idx == 'missing':
+                missing_drops.append((idx, encoder.drop_category))
+
+        if missing_drops:
+            msg = ("The following categories were supposed to be "
+                   "dropped, but were not found in the training "
+                   "data.\n{}".format("\n".join(
+                       ["Category: {}, Feature: {}".format(cat, idx)
+                        for idx, cat in missing_drops])))
             raise ValueError(msg)
-        # If we have both dropped columns and ignored unknown
-        # values, there will be ambiguous cells. This creates difficulties
-        # in interpreting the model.
-        if self.drop is not None and self.handle_unknown != 'error':
-            raise ValueError(
-                "`handle_unknown` must be 'error' when the drop parameter is "
-                "specified, as both would create categories that are all "
-                "zero.")
 
-    def _compute_drop_idx(self):
-        if self.drop is None:
-            return None
-        elif (isinstance(self.drop, str) and self.drop == 'first'):
-            return np.zeros(len(self.categories_), dtype=np.int_)
-        elif not isinstance(self.drop, str):
-            try:
-                self.drop = np.asarray(self.drop, dtype=object)
-                droplen = len(self.drop)
-            except (ValueError, TypeError):
-                msg = ("Wrong input for parameter `drop`. Expected "
-                       "'first', None or array of objects, got {}")
-                raise ValueError(msg.format(type(self.drop)))
-            if droplen != len(self.categories_):
-                msg = ("`drop` should have length equal to the number "
-                       "of features ({}), got {}")
-                raise ValueError(msg.format(len(self.categories_),
-                                            len(self.drop)))
-            missing_drops = [(i, val) for i, val in enumerate(self.drop)
-                             if val not in self.categories_[i]]
-            if any(missing_drops):
-                msg = ("The following categories were supposed to be "
-                       "dropped, but were not found in the training "
-                       "data.\n{}".format(
-                           "\n".join(
-                               ["Category: {}, Feature: {}".format(c, v)
-                                for c, v in missing_drops])))
-                raise ValueError(msg)
-            return np.array([np.where(cat_list == val)[0][0]
-                             for (val, cat_list) in
-                             zip(self.drop, self.categories_)], dtype=np.int_)
-        else:
-            msg = ("Wrong input for parameter `drop`. Expected "
-                   "'first', None or array of objects, got {}")
-            raise ValueError(msg.format(type(self.drop)))
+        return X_list
 
     def fit(self, X, y=None):
         """Fit OneHotEncoder to X.
 
         Parameters
         ----------
-        X : array-like, shape [n_samples, n_features]
-            The data to determine the categories of each feature.
+        X : array-like of shape (n_samples, n_features)
+            The data to determine the categories of each feature.
 
         Returns
        -------
         self
         """
-        self._validate_keywords()
-        self._fit(X, handle_unknown=self.handle_unknown)
-        self.drop_idx_ = self._compute_drop_idx()
+        self._fit(X)
         return self
 
-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
-
-        Equivalent to fit(X).transform(X) but more convenient.
-
-        Parameters
-        ----------
-        X : array-like, shape [n_samples, n_features]
-            The data to encode.
-
-        Returns
-        -------
-        X_out : sparse matrix if sparse=True else a 2-d array
-            Transformed input.
-        """
-        self._validate_keywords()
-        return super().fit_transform(X, y)
-
     def transform(self, X):
-        """Transform X using one-hot encoding.
+        """Transform X to a one-hot encoding.
 
         Parameters
         ----------
-        X : array-like, shape [n_samples, n_features]
+        X : array-like of shape (n_samples, n_features)
             The data to encode.
 
         Returns
         -------
-        X_out : sparse matrix if sparse=True else a 2-d array
-            Transformed input.
+        X_out : {ndarray, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            Transformed array. When `sparse=True`, `X_out` is a sparse array.
         """
         check_is_fitted(self)
-        # validation of X happens in _check_X called by _transform
-        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-
-        n_samples, n_features = X_int.shape
-
-        if self.drop is not None:
-            to_drop = self.drop_idx_.reshape(1, -1)
+        X_list = self._check_X(X)
+        return super()._transform_list(X_list)
 
-            # We remove all the dropped categories from mask, and decrement
-            # all categories that occur after them to avoid an empty column.
-
-            keep_cells = X_int != to_drop
-            X_mask &= keep_cells
-            X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - 1 for cats in self.categories_]
-        else:
-            n_values = [len(cats) for cats in self.categories_]
-
-        mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
-        feature_indices = np.cumsum(n_values)
-        indices = (X_int + feature_indices[:-1]).ravel()[mask]
-        indptr = X_mask.sum(axis=1).cumsum()
-        indptr = np.insert(indptr, 0, 0)
-        data = np.ones(n_samples * n_features)[mask]
-
-        out = sparse.csr_matrix((data, indices, indptr),
-                                shape=(n_samples, feature_indices[-1]),
-                                dtype=self.dtype)
-        if not self.sparse:
-            return out.toarray()
-        else:
-            return out
-
-    def inverse_transform(self, X):
-        """Convert the back data to the original representation.
-
-        In case unknown categories are encountered (all zeros in the
-        one-hot encoding), ``None`` is used to represent this category.
+    def fit_transform(self, X, y=None):
+        """Fit encoder to X and transform X.
 
         Parameters
         ----------
-        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
-            The transformed data.
+        X : array-like of shape (n_samples, n_features)
+            The data to encode.
 
         Returns
        -------
-        X_tr : array-like, shape [n_samples, n_features]
-            Inverse transformed array.
-
+        X_out : {ndarray, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            Transformed array. When `sparse=True`, `X_out` is a sparse array.
         """
-        check_is_fitted(self)
-        X = check_array(X, accept_sparse='csr')
-
-        n_samples, _ = X.shape
-        n_features = len(self.categories_)
-        if self.drop is None:
-            n_transformed_features = sum(len(cats)
-                                         for cats in self.categories_)
-        else:
-            n_transformed_features = sum(len(cats) - 1
-                                         for cats in self.categories_)
-
-        # validate shape of passed X
-        msg = ("Shape of the passed X data is not correct. Expected {0} "
-               "columns, got {1}.")
-        if X.shape[1] != n_transformed_features:
-            raise ValueError(msg.format(n_transformed_features, X.shape[1]))
-
-        # create resulting array of appropriate dtype
-        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
-        X_tr = np.empty((n_samples, n_features), dtype=dt)
-
-        j = 0
-        found_unknown = {}
-
-        for i in range(n_features):
-            if self.drop is None:
-                cats = self.categories_[i]
-            else:
-                cats = np.delete(self.categories_[i], self.drop_idx_[i])
-            n_categories = len(cats)
-
-            # Only happens if there was a column with a unique
-            # category. In this case we just fill the column with this
-            # unique category value.
-            if n_categories == 0:
-                X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
-                j += n_categories
-                continue
-            sub = X[:, j:j + n_categories]
-            # for sparse X argmax returns 2D matrix, ensure 1D array
-            labels = np.asarray(_argmax(sub, axis=1)).flatten()
-            X_tr[:, i] = cats[labels]
-            if self.handle_unknown == 'ignore':
-                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
-                # ignored unknown categories: we have a row of all zero
-                if unknown.any():
-                    found_unknown[i] = unknown
-            # drop will either be None or handle_unknown will be error. If
-            # self.drop is not None, then we can safely assume that all of
-            # the nulls in each column are the dropped value
-            elif self.drop is not None:
-                dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
-                if dropped.any():
-                    X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
-
-            j += n_categories
-
-        # if ignored are found: potentially need to upcast result to
-        # insert None values
-        if found_unknown:
-            if X_tr.dtype != object:
-                X_tr = X_tr.astype(object)
-
-            for idx, mask in found_unknown.items():
-                X_tr[mask, idx] = None
-
-        return X_tr
+        X_list = self._fit(X)
+        return super()._transform_list(X_list)
 
     def get_feature_names(self, input_features=None):
         """Return feature names for output features.
@@ -517,17 +408,219 @@ def get_feature_names(self, input_features=None):
                                  len(input_features)))
 
         feature_names = []
-        for i in range(len(cats)):
-            names = [
-                input_features[i] + '_' + str(t) for t in cats[i]]
+        for input_feat, cat, encoder in zip(input_features, cats,
+                                            self._single_encoders):
+            names = [input_feat + '_' + str(t) for t in cat]
             if self.drop is not None:
-                names.pop(self.drop_idx_[i])
+                names.pop(encoder.drop_idx_)
             feature_names.extend(names)
 
         return np.array(feature_names, dtype=object)
 
+    def _validate_keywords(self):
+        if self.handle_unknown not in ('error', 'ignore'):
+            msg = ("handle_unknown should be either 'error' or 'ignore', "
+                   "got {0}.".format(self.handle_unknown))
+            raise ValueError(msg)
+        # If we have both dropped columns and ignored unknown
+        # values, there will be ambiguous cells. This creates difficulties
+        # in interpreting the model.
+        if self.drop is not None and self.handle_unknown != 'error':
+            raise ValueError(
+                "`handle_unknown` must be 'error' when the drop parameter is "
+                "specified, as both would create categories that are all "
+                "zero.")
 
-class OrdinalEncoder(_BaseEncoder):
+    def _check_drop(self, n_features):
+        if self.drop is None:
+            return [{'drop_category': None}] * n_features
+        if isinstance(self.drop, str) and self.drop == 'first':
+            return [{'drop_first': True}] * n_features
+        if not isinstance(self.drop, str):
+            try:
+                drops = np.asarray(self.drop, dtype=object)
+                drops_len = len(drops)
+            except (ValueError, TypeError):
+                msg = ("Wrong input for parameter `drop`. Expected "
+                       "'first', None or array of objects, got {}")
+                raise ValueError(msg.format(type(self.drop)))
+            if drops_len != n_features:
+                msg = ("`drop` should have length equal to the number "
+                       "of features ({}), got {}")
+                raise ValueError(msg.format(n_features,
+                                            len(self.drop)))
+            return [{'drop_category': drop} for drop in drops]
+        else:
+            msg = ("Wrong input for parameter `drop`. Expected "
+                   "'first', None or array of objects, got {}")
+            raise ValueError(msg.format(type(self.drop)))
+
+
+class _SingleOneHotEncoder(TransformerMixin, BaseEstimator):
+    """One hot encoder for a single categorical feature.
+
+    When calling `fit`, the attribute `drop_idx_` will be set to 'missing'
+    when the drop category is not found among the categories learned from
+    the dataset or given via `categories`. `drop_idx_` should be checked
+    by the caller to make sure the encoder is valid."""
+
+    def __init__(self, categories='auto', drop_category=None,
+                 drop_first=False, dtype=np.float64, handle_unknown='error',
+                 sparse=True, feature_idx=0):
+        self.categories = categories
+        self.dtype = dtype
+        self.handle_unknown = handle_unknown
+        self.drop_category = drop_category
+        self.drop_first = drop_first
+        self.sparse = sparse
+        self.feature_idx = feature_idx
+
+    def fit(self, X):
+        """Fit one hot encoder for a single categorical feature.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
+
+        Returns
+        -------
+        self
+        """
+        if isinstance(self.categories, str) and self.categories == 'auto':
+            cats = _encode(X)
+        else:  # categories were given
+            cats = np.array(self.categories, dtype=X.dtype)
+            if X.dtype != object and not np.all(np.sort(cats) == cats):
+                raise ValueError("Unsorted categories are not "
+                                 "supported for numerical categories")
+
+        if self.handle_unknown == 'error':
+            diff = _encode_check_unknown(X, cats)
+            if diff:
+                msg = ("Found unknown categories {0} in column {1} "
+                       "during fit".format(diff, self.feature_idx))
+                raise ValueError(msg)
+
+        self.categories_ = cats
+
+        # compute drop idx
+        if self.drop_first:
+            self.drop_idx_ = 0
+        elif self.drop_category is None:
+            self.drop_idx_ = None
+        elif self.drop_category not in self.categories_:
+            # This is an error state. Caller should check this value and
+            # handle accordingly.
+            self.drop_idx_ = 'missing'
+        else:
+            self.drop_idx_ = np.where(self.categories_ ==
+                                      self.drop_category)[0][0]
+
+        if self.drop_idx_ is not None:
+            self.n_features_out_ = len(self.categories_) - 1
+        else:
+            self.n_features_out_ = len(self.categories_)
+        return self
+
+    def transform(self, X):
+        """Transform a single categorical feature.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
+
+        Returns
+        -------
+        X_tr : {ndarray, sparse matrix} of shape \
+                (n_samples, n_encoded_features)
+            Encoded feature. If `sparse=True`, a sparse matrix is returned.
+        """
+        diff, X_mask = _encode_check_unknown(X, self.categories_,
+                                             return_mask=True)
+        if not np.all(X_mask):
+            if self.handle_unknown == 'error':
+                msg = ("Found unknown categories {0} in column {1} "
+                       "during transform".format(diff, self.feature_idx))
+                raise ValueError(msg)
+
+            # cast X into the largest string type necessary
+            # to handle different lengths of numpy strings
+            if (self.categories_.dtype.kind in ('U', 'S')
+                    and self.categories_.itemsize > X.itemsize):
+                X = X.astype(self.categories_.dtype)
+            else:
+                X = X.copy()
+            X[~X_mask] = self.categories_[0]
+
+        _, X_encoded = _encode(X, self.categories_, encode=True,
+                               check_unknown=False)
+
+        if self.drop_idx_ is not None:
+            keep_cells = X_encoded != self.drop_idx_
+            X_mask &= keep_cells
+
+            # adjust encoding to remove the dropped column
+            X_encoded[X_encoded > self.drop_idx_] -= 1
+
+        n_samples = X.shape[0]
+        row = np.flatnonzero(X_mask)
+        col = X_encoded[X_mask]
+        data = np.ones(np.sum(X_mask), dtype=self.dtype)
+        out = sparse.csc_matrix((data, (row, col)),
+                                shape=(n_samples, self.n_features_out_))
+        if self.sparse:
+            return out
+        else:
+            return out.toarray()
+
+    def inverse_transform(self, X):
+        """Inverse-transform back to a single categorical feature.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)
+            The transformed data of a single feature.
+
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, 1)
+            Inverse transformed array.
+ """ + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. + if self.n_features_out_ == 0: + value = self.categories_[self.drop_idx_] + return np.full((X.shape[0], 1), value) + + if self.drop_idx_ is None: + cats = self.categories_ + else: + cats = np.delete(self.categories_, self.drop_idx_) + + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(_argmax(X, axis=1)).flatten() + X_tr = cats[labels] + if self.handle_unknown == 'ignore': + unknown = np.asarray(X.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + X_tr[unknown] = None + # drop will either be None or handle_unknown will be error. If + # self.drop is not None, then we can safely assume that all of + # the nulls in each column are the dropped value + elif self.drop_idx_ is not None: + dropped = np.asarray(X.sum(axis=1) == 0).flatten() + if dropped.any(): + X_tr[dropped] = self.categories_[self.drop_idx_] + + return X_tr[:, None] + + +class OrdinalEncoder(_EncoderUnion): """Encode categorical features as an integer array. The input to this transformer should be an array-like of integers or @@ -586,77 +679,154 @@ class OrdinalEncoder(_BaseEncoder): sklearn.preprocessing.LabelEncoder : encodes target labels with values between 0 and n_classes-1. """ - def __init__(self, categories='auto', dtype=np.float64): self.categories = categories self.dtype = dtype + def _fit(self, X): + """Validate keywords and fit `X` and return `X_list`.""" + X_list = self._check_X(X) + categories = self._check_categories(len(X_list)) + + encoders = [_SingleOrdinalEncoder(categories=cat, + dtype=self.dtype, + feature_idx=idx) + for idx, cat in enumerate(categories)] + + super()._fit_list(X_list, encoders) + return X_list + def fit(self, X, y=None): - """Fit the OrdinalEncoder to X. + """Fit OneHotEncoder to X. Parameters ---------- - X : array-like, shape [n_samples, n_features] - The data to determine the categories of each feature. + X : array-like of shape (n_samples, n_features) + The data to determine the categroies of each feature. Returns ------- self - """ self._fit(X) - return self def transform(self, X): - """Transform X to ordinal codes. + """Transform X to encoding Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. Returns ------- - X_out : sparse matrix or a 2-d array - Transformed input. + X_out : ndarray of shape (n_samples, n_encoded_features) + Transformed array. + """ + check_is_fitted(self) + X_list = self._check_X(X) + return super()._transform_list(X_list) + def fit_transform(self, X, y=None): + """Fit encoder to X and transform X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + + Returns + ------- + X_out : ndarray of shape (n_samples, n_encoded_features) + Transformed array. """ - X_int, _ = self._transform(X) - return X_int.astype(self.dtype, copy=False) + X_list = self._fit(X) + return super()._transform_list(X_list) - def inverse_transform(self, X): - """Convert the data back to the original representation. 
+
+
+class _SingleOrdinalEncoder(TransformerMixin, BaseEstimator):
+    """Ordinal encoder for a single categorical feature."""
+
+    def __init__(self, categories='auto', dtype=np.float64, feature_idx=0):
+        self.categories = categories
+        self.dtype = dtype
+        self.feature_idx = feature_idx
+
+    def fit(self, X, y=None):
+        """Fit ordinal encoder on a single categorical feature.
 
         Parameters
         ----------
-        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
-            The transformed data.
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
 
         Returns
         -------
-        X_tr : array-like, shape [n_samples, n_features]
-            Inverse transformed array.
+        self
+        """
+        if isinstance(self.categories, str) and self.categories == 'auto':
+            cats = _encode(X)
+        else:  # categories were given
+            cats = np.array(self.categories, dtype=X.dtype)
+            if X.dtype != object and not np.all(np.sort(cats) == cats):
+                raise ValueError("Unsorted categories are not "
+                                 "supported for numerical categories")
+        diff = _encode_check_unknown(X, cats)
+        if diff:
+            msg = ("Found unknown categories {0} in column {1} "
+                   "during fit".format(diff, self.feature_idx))
+            raise ValueError(msg)
+
+        self.categories_ = cats
+        self.n_features_out_ = 1
+        return self
+
+    def transform(self, X):
+        """Transform a single categorical feature.
 
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples,)
+            Categorical feature to encode.
+
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, 1)
+            Encoded categorical feature.
         """
-        check_is_fitted(self)
-        X = check_array(X, accept_sparse='csr')
+        diff, valid_mask = _encode_check_unknown(X, self.categories_,
+                                                 return_mask=True)
+        if not np.all(valid_mask):
+            # this encoder has no `handle_unknown` option: unknown
+            # categories always raise an error during transform
+            msg = ("Found unknown categories {0} in column {1} "
+                   "during transform".format(diff, self.feature_idx))
+            raise ValueError(msg)
 
-        n_samples, _ = X.shape
-        n_features = len(self.categories_)
-
-        # validate shape of passed X
-        msg = ("Shape of the passed X data is not correct. Expected {0} "
-               "columns, got {1}.")
-        if X.shape[1] != n_features:
-            raise ValueError(msg.format(n_features, X.shape[1]))
+        _, encoded = _encode(X, self.categories_, encode=True,
+                             check_unknown=False)
+        return encoded[:, None].astype(self.dtype, copy=False)
 
-        # create resulting array of appropriate dtype
-        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
-        X_tr = np.empty((n_samples, n_features), dtype=dt)
+    def inverse_transform(self, X):
+        """Convert the data back into the original representation.
 
-        for i in range(n_features):
-            labels = X[:, i].astype('int64', copy=False)
-            X_tr[:, i] = self.categories_[i][labels]
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, 1)
+            Transformed data.
 
-        return X_tr
+        Returns
+        -------
+        X_tr : ndarray of shape (n_samples, 1)
+            Inverse transformed array.
+        """
+        labels = X.astype('int64', copy=False)
+        return self.categories_[labels]
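Note for reviewers: the snippet below is a quick, self-contained sketch (not part of the patch) of the public encoder behavior this refactor is expected to preserve; it only uses the existing public API (OneHotEncoder, OrdinalEncoder, categories_, inverse_transform), so it should behave identically before and after the change.

import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

X = np.array([['cat', 2], ['dog', 1], ['cat', 3]], dtype=object)

# one output column per category: 2 + 3 = 5 encoded features
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X)
X_ohe = ohe.transform(X)                     # shape (3, 5)
assert ohe.inverse_transform(X_ohe).tolist() == X.tolist()

# ordinal codes, one encoded column per input column
oe = OrdinalEncoder().fit(X)
print(oe.transform(X))                       # [[0. 1.], [1. 0.], [0. 2.]]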