diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 709239687158e..bf76d499b464b 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -378,10 +378,10 @@ Encoding categorical features
 Often features are not given as continuous values but categorical.
 For example a person could have features ``["male", "female"]``,
 ``["from Europe", "from US", "from Asia"]``,
-``["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]``.
+``["Firefox", "Chrome", "Safari", "Internet Explorer"]``.
 Such features can be efficiently coded as integers, for instance
-``["male", "from US", "uses Internet Explorer"]`` could be expressed as
-``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
+``["male", "from US", "Internet Explorer"]`` could be expressed as
+``[0, 1, 3]`` while ``["female", "from Asia", "Chrome"]`` would be
 ``[1, 2, 1]``.
 
 Such integer representation can not be used directly with scikit-learn estimators, as these
@@ -397,31 +397,42 @@ only one active.
 Continuing the example above::
 
     >>> enc = preprocessing.OneHotEncoder()
-    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  # doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values='auto', sparse=True)
-    >>> enc.transform([[0, 1, 3]]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
+    >>> enc.fit([['female', 'from US', 'Chrome'],
+    ...          ['male', 'from Asia', 'Firefox']]) \
+    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    OneHotEncoder(categorical_features='all',
+           dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
+           sparse=True, values='auto')
+    >>> enc.transform([['female', 'from Asia', 'Firefox']]).toarray()
+    array([[ 1.,  0.,  1.,  0.,  0.,  1.]])
 
 By default, how many values each feature can take is inferred automatically
 from the dataset.
-It is possible to specify this explicitly using the parameter ``n_values``.
+It is possible to specify this explicitly using the parameter ``values``.
 There are two genders, three possible continents and four web browsers in our
 dataset.
 Then we fit the estimator, and transform a data point.
-In the result, the first two numbers encode the gender, the next set of three
-numbers the continent and the last four the web browser.
+In the result, the first two values encode the gender, the next three values
+the continent, and the last four values the web browser.
 
 Note that, if there is a possibility that the training data might have missing
 categorical values, one has to explicitly set ``values``. For example,
 
-    >>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
-    >>> # Note that there are missing categorical values for the 2nd and 3rd
-    >>> # features
-    >>> enc.fit([[1, 2, 3], [0, 2, 0]])  # doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values=[2, 3, 4], sparse=True)
-    >>> enc.transform([[1, 0, 0]]).toarray()
-    array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
+    >>> browsers = ['Internet Explorer', 'Chrome', 'Safari', 'Firefox']
+    >>> genders = ['male', 'female']
+    >>> locations = ['from Europe', 'from Asia', 'from US']
+    >>> enc = preprocessing.OneHotEncoder(values=[genders, locations, browsers])
+    >>> # Note that there are missing categorical values for the
+    >>> # 2nd and 3rd features
+    >>> enc.fit([['female', 'from US', 'Chrome'],
+    ...          ['male', 'from Asia', 'Internet Explorer']]) \
+    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    OneHotEncoder(categorical_features='all',
+           dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
+           sparse=True,
+           values=[...])
+
+    >>> enc.transform([['male', 'from Europe', 'Safari']]).toarray()
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 
 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as integers.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 9a092310f4924..bd3a5def36675 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -171,6 +171,16 @@ Enhancements
      removed by setting it to `None`.
      :issue:`7674` by :user:`Yichuan Liu `.
 
+   - :class:`preprocessing.OneHotEncoder` now fits and transforms inputs of
+     any numerical or string type instead of only integer arrays.
+     It has additional fitted attributes ``feature_index_range_``,
+     ``one_hot_feature_index_``, and ``categories_``.
+     In addition to the previously allowed values, ``handle_unknown`` accepts
+     "error-strict" to raise an error if any unknown values are seen during
+     transformation.
+     :issue:`7327` and :issue:`8793` by
+     :user:`Vighnesh Birodkar ` and
+     :user:`Stephen Hoover `.
+
 Bug fixes
 .........
 
    - Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an
@@ -329,6 +339,15 @@ API changes summary
      the weighted impurity decrease from splitting is no longer at least
      ``min_impurity_decrease``. :issue:`8449` by `Raghav RV`_
 
+   - In :class:`preprocessing.OneHotEncoder`, deprecate the
+     ``feature_indices_`` and ``active_features_`` attributes.
+     Deprecate integer and list-of-integers inputs to ``values``
+     in favor of lists of lists of categories.
+     The present behavior of ``handle_unknown="error"`` will
+     change to be the same as ``handle_unknown="error-strict"`` in v0.21.
+     :issue:`7327` and :issue:`8793` by
+     :user:`Vighnesh Birodkar ` and
+     :user:`Stephen Hoover `.
 
 .. _changes_0_18_1:
@@ -5070,4 +5089,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 
 .. _Anish Shah: https://github.com/AnishShah
 
 .. _Neeraj Gangwar: http://neerajgangwar.in
-.. _Arthur Mensch: https://amensch.fr
\ No newline at end of file
+.. _Arthur Mensch: https://amensch.fr
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 093137d078000..ee21c6726e620 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -7,7 +7,6 @@
 # License: BSD 3 clause
 
 from itertools import chain, combinations
-import numbers
 import warnings
 from itertools import combinations_with_replacement as combinations_w_r
 
@@ -26,6 +25,8 @@
     mean_variance_axis, incr_mean_variance_axis, min_max_axis)
 from ..utils.validation import check_is_fitted, FLOAT_DTYPES
+from .label import LabelEncoder
+from ..utils.fixes import in1d
 
 zip = six.moves.zip
@@ -1618,37 +1619,41 @@ def add_dummy_feature(X, value=1.0):
     return np.hstack((np.ones((n_samples, 1)) * value, X))
 
 
-def _transform_selected(X, transform, selected="all", copy=True):
-    """Apply a transform function to portion of selected features
+def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
+                    return_val=True):
+    """Apply a function to portion of selected features
 
     Parameters
    ----------
-    X : {array-like, sparse matrix}, shape [n_samples, n_features]
+    X : {array, sparse matrix}, shape [n_samples, n_features]
         Dense array or sparse matrix.
-
     transform : callable
         A callable transform(X) -> X_transformed
-
+    dtype : dtype
+        Cast outputs to this data type.
     copy : boolean, optional
         Copy X even if it could be avoided.
-
     selected : "all" or array of indices or mask
         Specify which features to apply the transform to.
+    return_val : boolean, optional
+        Whether to return the transformed matrix. If False, `None` is
+        returned.
 
     Returns
     -------
     X : array or sparse matrix, shape=(n_samples, n_features_new)
     """
-    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
+    if copy:
+        X = X.copy()
 
     if isinstance(selected, six.string_types) and selected == "all":
-        return transform(X)
+        X_trans = transform(X)
+        return X_trans.astype(dtype) if return_val else None
 
     if len(selected) == 0:
-        return X
+        return X.astype(dtype) if return_val else None
 
     n_features = X.shape[1]
-    ind = np.arange(n_features)
     sel = np.zeros(n_features, dtype=bool)
     sel[np.asarray(selected)] = True
     not_sel = np.logical_not(sel)
@@ -1656,28 +1661,31 @@
 
     if n_selected == 0:
         # No features selected.
-        return X
+        return X.astype(dtype) if return_val else None
     elif n_selected == n_features:
         # All features selected.
-        return transform(X)
+        X_trans = transform(X)
+        return X_trans.astype(dtype) if return_val else None
     else:
+        ind = np.arange(n_features)
         X_sel = transform(X[:, ind[sel]])
-        X_not_sel = X[:, ind[not_sel]]
 
-        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
-            return sparse.hstack((X_sel, X_not_sel))
-        else:
-            return np.hstack((X_sel, X_not_sel))
+        if return_val:
+            X_sel = X_sel.astype(dtype)
+            X_not_sel = X[:, ind[not_sel]].astype(dtype)
+            if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
+                return sparse.hstack((X_sel, X_not_sel))
+            else:
+                return np.hstack((X_sel, X_not_sel))
 
 
 class OneHotEncoder(BaseEstimator, TransformerMixin):
-    """Encode categorical integer features using a one-hot aka one-of-K scheme.
+    """Encode categorical features using a one-hot aka one-of-K scheme.
 
-    The input to this transformer should be a matrix of integers, denoting
-    the values taken on by categorical (discrete) features. The output will be
-    a sparse matrix where each column corresponds to one possible value of one
-    feature. It is assumed that input features take on values in the range
-    [0, n_values).
+    The input to this transformer should be a matrix of integers or strings,
+    denoting the values taken on by categorical (discrete) features. The
+    output will be a sparse matrix where each column corresponds to one
+    possible value of one feature.
 
     This encoding is needed for feeding categorical data to many scikit-learn
     estimators, notably linear models and SVMs with the standard kernels.
 
@@ -1689,15 +1697,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    n_values : 'auto', int or array of ints
-        Number of values per feature.
-
-        - 'auto' : determine value range from training data.
-        - int : number of categorical values per feature.
-                Each feature value should be in ``range(n_values)``
-        - array : ``n_values[i]`` is the number of categorical values in
-                  ``X[:, i]``. Each feature value should be
-                  in ``range(n_values[i])``
+    values : 'auto' or List[List[object]]
+        - 'auto' (default) : Encoded values are those found in training data.
+        - list of lists : values for feature ``i`` are in ``values[i]``.
 
     categorical_features : "all" or array of indices or mask
         Specify what features are treated as categorical.
@@ -1708,49 +1710,63 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
         Non-categorical features are always stacked to the right of the matrix.
 
-    dtype : number type, default=np.float
+    dtype : number type, default=np.float64
         Desired dtype of output.
 
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
-    handle_unknown : str, 'error' or 'ignore'
-        Whether to raise an error or ignore if a unknown categorical feature is
-        present during transform.
+    handle_unknown : {'error', 'error-strict', 'ignore'}
+        - 'ignore': Ignore all unknown feature values.
+        - 'error': Raise an error when the value of an integer feature is more
+          than the maximum value seen during fit or less than zero, or when
+          the value of a non-integer feature was unseen during ``fit``.
+        - 'error-strict': Raise an error when the value of a feature is unseen
+          during ``fit``.
 
     Attributes
     ----------
-    active_features_ : array
-        Indices for active features, meaning values that actually occur
-        in the training set. Only available when n_values is ``'auto'``.
+    feature_index_range_ : array, shape (n_features, 2)
+        ``feature_index_range_[i]`` specifies the range of column indices
+        occupied by the input feature ``i`` in the one-hot encoded array.
 
-    feature_indices_ : array of shape (n_features,)
-        Indices to feature ranges.
-        Feature ``i`` in the original data is mapped to features
-        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
-        (and then potentially masked by `active_features_` afterwards)
+    one_hot_feature_index_ : array, shape (n_features_new,)
+        ``one_hot_feature_index_[i]`` specifies which feature of the input
+        is encoded by column ``i`` in the one-hot encoded array.
+
+    categories_ : array, shape (n_features_new,)
+        np.object array containing the category encoded in each column
+        of the output (or None for non-categorical columns).
 
     n_values_ : array of shape (n_features,)
-        Maximum number of values per feature.
+        Number of encoded categories per feature. Has value `0` for
+        non-categorical features.
 
     Examples
     --------
-    Given a dataset with three features and four samples, we let the encoder
-    find the maximum value per feature and transform the data to a binary
+    Given a dataset with two features and three samples, we let the encoder
+    find the categories in each feature and transform the data to a binary
     one-hot encoding.
 
+    >>> import numpy as np
     >>> from sklearn.preprocessing import OneHotEncoder
     >>> enc = OneHotEncoder()
-    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
-[1, 0, 2]])  # doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
-           handle_unknown='error', n_values='auto', sparse=True)
+    >>> enc.fit(np.array([['cat', 4], ['mouse', 15], ['dog', 17]], dtype='O'))\
+     # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    OneHotEncoder(categorical_features='all',
+           dtype=<... 'numpy.float64'>, handle_unknown='error', n_values=None,
+           sparse=True, values='auto')
     >>> enc.n_values_
-    array([2, 3, 4])
-    >>> enc.feature_indices_
-    array([0, 2, 5, 9])
-    >>> enc.transform([[0, 1, 1]]).toarray()
-    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])
+    array([3, 3])
+    >>> enc.feature_index_range_
+    array([[0, 3],
+           [3, 6]])
+    >>> enc.one_hot_feature_index_
+    array([0, 0, 0, 1, 1, 1])
+    >>> (enc.categories_ ==
+    ...  np.array(['cat', 'dog', 'mouse', 4, 15, 17], dtype='O')).all()
+    True
+    >>> enc.transform([['dog', 4]]).toarray()
+    array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
 
     See also
     --------
@@ -1766,138 +1782,286 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
       sklearn.preprocessing.LabelEncoder : encodes labels with values between
       0 and n_classes-1.
     """
-    def __init__(self, n_values="auto", categorical_features="all",
-                 dtype=np.float64, sparse=True, handle_unknown='error'):
-        self.n_values = n_values
+    def __init__(self, values='auto', categorical_features="all",
+                 n_values=None, dtype=np.float64, sparse=True,
+                 handle_unknown='error'):
+        self.values = values
         self.categorical_features = categorical_features
         self.dtype = dtype
         self.sparse = sparse
         self.handle_unknown = handle_unknown
+        self.n_values = n_values
 
     def fit(self, X, y=None):
-        """Fit OneHotEncoder to X.
+        """Fit the OneHotEncoder to X.
 
         Parameters
         ----------
         X : array-like, shape [n_samples, n_features]
-            Input array of type int.
+            Array of ints or strings or both.
 
         Returns
         -------
         self
         """
-        self.fit_transform(X)
+        if self.handle_unknown not in ['ignore', 'error', 'error-strict']:
+            template = ("handle_unknown should be either 'error', "
+                        "'error-strict', or 'ignore', got %s")
+            raise ValueError(template % self.handle_unknown)
+        elif self.handle_unknown == 'error':
+            warnings.warn('The behavior of handle_unknown="error" is '
+                          'deprecated and will be changed to be the same '
+                          'as "error-strict" in version 0.21', FutureWarning)
+
+        X = check_array(X, dtype=None, accept_sparse='csc', copy=False)
+        n_samples, n_features = X.shape
+        self.n_features_ = n_features
+
+        _apply_selected(X, self._fit, dtype=self.dtype, return_val=False,
+                        selected=self.categorical_features, copy=False)
+
+        # Record which columns of the output data
+        # correspond to each column of the input data
+        self.feature_index_range_ = np.zeros((n_features, 2), dtype=np.int)
+
+        if isinstance(self.categorical_features, six.string_types) and \
+                self.categorical_features == "all":
+            categorical = np.ones(n_features, dtype=bool)
+        else:
+            categorical = np.zeros(n_features, dtype=bool)
+            categorical[np.asarray(self.categorical_features)] = True
+
+        start, end = 0, 0
+        for i_cat, i_feat in enumerate(np.where(categorical)[0]):
+            if np.isscalar(self._values) and self.handle_unknown == 'error':
+                end = start + self._n_active_features_[i_cat]
+            else:
+                end = start + len(self._label_encoders[i_cat].classes_)
+            self.feature_index_range_[i_feat] = start, end
+            start = end
+        num_cat_cols = np.sum(categorical)
+        non_cat_indices = np.arange(start, start + n_features - num_cat_cols)
+        self.feature_index_range_[~categorical, 0] = non_cat_indices
+        self.feature_index_range_[~categorical, 1] = non_cat_indices + 1
+
+        # Record which column of the input data corresponds
+        # to each column of the output data
+        n_cats = np.diff(self.feature_index_range_, axis=1).ravel()
+        inp_order = np.argsort(self.feature_index_range_[:, 0])
+        self.one_hot_feature_index_ = np.repeat(inp_order, n_cats[inp_order])
+
+        # Count categories per feature
+        self.n_values_ = n_cats.copy()
+        self.n_values_[~categorical] = 0
+
+        # Store categories for each output feature
+        if num_cat_cols == 0:
+            cats = []
+        else:
+            cats = np.concatenate([le.classes_ for le in self._label_encoders])
+            if hasattr(self, '_active_features_'):
+                cats = cats[self._active_features_]
+        self.categories_ = np.hstack([cats, len(non_cat_indices) * [None]])
+
         return self
 
-    def _fit_transform(self, X):
-        """Assumes X contains only categorical features."""
-        X = check_array(X, dtype=np.int)
-        if np.any(X < 0):
-            raise ValueError("X needs to contain only non-negative integers.")
-        n_samples, n_features = X.shape
-        if (isinstance(self.n_values, six.string_types) and
-                self.n_values == 'auto'):
-            n_values = np.max(X, axis=0) + 1
-        elif isinstance(self.n_values, numbers.Integral):
-            if (np.max(X, axis=0) >= self.n_values).any():
-                raise ValueError("Feature out of bounds for n_values=%d"
-                                 % self.n_values)
-            n_values = np.empty(n_features, dtype=np.int)
-            n_values.fill(self.n_values)
+    def _check_values(self, values, n_features):
+        """Verify that the input `values` is valid
+
+        Raises ValueError or TypeError for bad `values`.
+        Assume that integers or lists of integers have been
+        converted to lists of arrays before getting here.
+        This should run after `_initialize_values`.
+        """
+        error_msg = ("`values` should be 'auto', an integer, "
+                     "a list of integers or a list of lists")
+        if isinstance(values, six.string_types):
+            if values != 'auto':
+                raise ValueError(error_msg)
+        elif isinstance(values, list) or isinstance(values, np.ndarray):
+            if len(values) != n_features:
+                raise ValueError("Shape mismatch: if values is a list,"
+                                 " it has to be of length n_features.")
+
+            # All entries must be either arrays or lists here
+            if any([np.isscalar(val) for val in values]):
+                raise ValueError(error_msg)
         else:
-            try:
-                n_values = np.asarray(self.n_values, dtype=int)
-            except (ValueError, TypeError):
-                raise TypeError("Wrong type for parameter `n_values`. Expected"
-                                " 'auto', int or array of ints, got %r"
-                                % type(X))
-            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
-                raise ValueError("Shape mismatch: if n_values is an array,"
-                                 " it has to be of shape (n_features,).")
-
-        self.n_values_ = n_values
-        n_values = np.hstack([[0], n_values])
-        indices = np.cumsum(n_values)
-        self.feature_indices_ = indices
+            raise TypeError(error_msg)
 
-        column_indices = (X + indices[:-1]).ravel()
-        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
-                                n_features)
-        data = np.ones(n_samples * n_features)
-        out = sparse.coo_matrix((data, (row_indices, column_indices)),
-                                shape=(n_samples, indices[-1]),
-                                dtype=self.dtype).tocsr()
+    def _initialize_values(self):
+        """Standardize the `values` input
 
-        if (isinstance(self.n_values, six.string_types) and
-                self.n_values == 'auto'):
-            mask = np.array(out.sum(axis=0)).ravel() != 0
-            active_features = np.where(mask)[0]
-            out = out[:, active_features]
-            self.active_features_ = active_features
+        Output is either a string or a list of arrays.
+        """
+        if self.n_values is not None:
+            warnings.warn('`n_values` has been renamed to `values`. '
+                          'The parameter `n_values` has been deprecated '
+                          'and will be removed in version 0.21; use the '
+                          'parameter `values` instead and specify the '
+                          'expected values for each feature.', FutureWarning)
+            values = self.n_values
+        else:
+            values = self.values
+
+        # Convert `int` and `Sequence[int]` inputs to `List[Array[int]]`
+        if (not isinstance(values, six.string_types) and
+                np.isscalar(values)):
+            warnings.warn('Integer input to `values` is deprecated and'
+                          ' will be removed in version 0.21. Specify a '
+                          'list of allowed values for each feature instead.',
+                          FutureWarning)
+            values = np.ones(self.n_features_cat_, dtype=int) * values
+        if (not isinstance(values, six.string_types) and
+                np.isscalar(values[0])):
+            warnings.warn('List of integer input to `values` is deprecated and'
+                          ' will be removed in version 0.21. Specify a '
+                          'list of allowed values for each feature instead.',
+                          FutureWarning)
+            values = [np.arange(v, dtype=np.int) for v in values]
+
+        return values
 
-        return out if self.sparse else out.toarray()
+    def _fit(self, X):
+        """Assumes `X` contains only categorical features"""
+        n_samples, n_features = X.shape
+        self.n_features_cat_ = n_features
+        self._label_encoders = [LabelEncoder() for i in range(n_features)]
+
+        self._values = self._initialize_values()
+        self._check_values(self._values, n_features)
+
+        # Fit on the categorical features in the data
+        _auto_int_classes = n_features * [None]
+        for i in range(n_features):
+            le = self._label_encoders[i]
+
+            if np.isscalar(self._values) and self.handle_unknown == 'error':
+                # For integer features, allow integers between
+                # 0 and the column max. The transform will still only
+                # return dummy columns for integers present in training data.
+                if (not isinstance(X[0, i], six.string_types) and
+                        int(X[0, i]) == X[0, i]):
+                    _auto_int_classes[i] = np.unique(X[:, i]).astype(int)
+                    if np.min(_auto_int_classes[i]) < 0:
+                        msg = ('Column %s has value(s) less than zero; all '
+                               'integer columns must have minimum value '
+                               '0 when values="auto" and '
+                               'handle_unknown="error".')
+                        raise ValueError(msg % i)
+                    n_classes = np.max(_auto_int_classes[i]) + 1
+                    le.fit(np.arange(n_classes))
+                else:
+                    le.fit(X[:, i])
+            elif np.isscalar(self._values):
+                le.fit(X[:, i])
+            else:
+                le.fit(self._values[i])
+
+        if np.isscalar(self._values) and self.handle_unknown == 'error':
+            # Record which integer features were present in the training
+            # data so we can restrict output columns.
+            active_features = []
+            for i_col, int_classes in enumerate(_auto_int_classes):
+                if int_classes is None:
+                    n_classes = len(self._label_encoders[i_col].classes_)
+                    active_features.append(np.ones(n_classes, dtype=bool))
+                else:
+                    n_classes = max(self._label_encoders[i_col].classes_) + 1
+                    this_col_mask = np.zeros(n_classes, dtype=bool)
+                    this_col_mask[int_classes] = True
+                    active_features.append(this_col_mask)
+            self._n_active_features_ = np.array([a.sum()
+                                                 for a in active_features])
+            self._active_features_ = np.where(np.hstack(active_features))[0]
 
-        return out if self.sparse else out.toarray()
+    def transform(self, X, y=None):
+        """Encode the selected categorical features using the one-hot scheme.
 
-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+            Array of ints or strings or both.
 
-        Equivalent to self.fit(X).transform(X), but more convenient and more
-        efficient. See fit for the parameters, transform for the return value.
+        Returns
+        -------
+        out : array, shape [n_samples, n_features_new]
+            `X` encoded using the one-hot scheme. Will be a CSR sparse
+            matrix if `self.sparse` is True.
         """
-        return _transform_selected(X, self._fit_transform,
-                                   self.categorical_features, copy=True)
+        X = check_array(X, accept_sparse='csc', dtype=None, copy=False)
+        if X.shape[1] != self.n_features_:
+            raise ValueError("Input data must have %s "
+                             "features." % self.n_features_)
+
+        return _apply_selected(X, self._transform, dtype=self.dtype,
+                               selected=self.categorical_features, copy=False)
 
     def _transform(self, X):
-        """Assumes X contains only categorical features."""
-        X = check_array(X, dtype=np.int)
-        if np.any(X < 0):
-            raise ValueError("X needs to contain only non-negative integers.")
+        """Assumes `X` contains only categorical features."""
         n_samples, n_features = X.shape
+        X_int = np.zeros_like(X, dtype=np.int32)
+
+        # Recode all columns of the input data as integers
+        if self.handle_unknown in ['error', 'error-strict']:
+            for i, le in enumerate(self._label_encoders):
+                try:
+                    X_int[:, i] = le.transform(X[:, i])
+                except ValueError as err:
+                    orig_msg = str(err)
+                    if not orig_msg.startswith('y contains'):
+                        raise
+                    else:
+                        msg = 'Column %d %s' % (i, orig_msg[2:])
+                        raise ValueError(msg)
+            mask = slice(None)
+        else:
+            X_mask = np.ones_like(X, dtype=np.bool)
+            for i, le in enumerate(self._label_encoders):
+                valid_mask = in1d(X[:, i], le.classes_)
+                if not np.all(valid_mask):
+                    X_mask[:, i] = valid_mask
+                    X_int[valid_mask, i] = le.transform(X[valid_mask, i])
+                else:
+                    X_int[:, i] = le.transform(X[:, i])
+            mask = X_mask.ravel()
+
+        # Convert the integer columns to a sparse array of binary indicators
+        n_values = [0] + [le.classes_.shape[0] for le in self._label_encoders]
+        indices = np.cumsum(n_values)
 
-        indices = self.feature_indices_
-        if n_features != indices.shape[0] - 1:
-            raise ValueError("X has different shape than during fitting."
-                             " Expected %d, got %d."
-                             % (indices.shape[0] - 1, n_features))
-
-        # We use only those categorical features of X that are known using fit.
-        # i.e lesser than n_values_ using mask.
-        # This means, if self.handle_unknown is "ignore", the row_indices and
-        # col_indices corresponding to the unknown categorical feature are
-        # ignored.
-        mask = (X < self.n_values_).ravel()
-        if np.any(~mask):
-            if self.handle_unknown not in ['error', 'ignore']:
-                raise ValueError("handle_unknown should be either error or "
-                                 "unknown got %s" % self.handle_unknown)
-            if self.handle_unknown == 'error':
-                raise ValueError("unknown categorical feature present %s "
-                                 "during transform." % X.ravel()[~mask])
-
-        column_indices = (X + indices[:-1]).ravel()[mask]
+        column_indices = (X_int + indices[:-1]).ravel()[mask]
         row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                 n_features)[mask]
-        data = np.ones(np.sum(mask))
+        data = np.ones(len(row_indices), dtype=self.dtype)
+
         out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                 shape=(n_samples, indices[-1]),
                                 dtype=self.dtype).tocsr()
-        if (isinstance(self.n_values, six.string_types) and
-                self.n_values == 'auto'):
-            out = out[:, self.active_features_]
-        return out if self.sparse else out.toarray()
+        if np.isscalar(self._values) and self.handle_unknown == 'error':
+            out = out[:, self._active_features_]
 
-    def transform(self, X):
-        """Transform X using one-hot encoding.
+        return out if self.sparse else out.toarray()
 
-        Parameters
-        ----------
-        X : array-like, shape [n_samples, n_features]
-            Input array of type int.
+    @property
+    def active_features_(self):
+        warnings.warn('The property `active_features_` is deprecated and'
+                      ' will be removed in version 0.21', FutureWarning)
+        if not hasattr(self, '_active_features_'):
+            raise AttributeError("'OneHotEncoder' object has no attribute "
+                                 "'active_features_'.")
+        return self._active_features_
 
-        Returns
-        -------
-        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
-            Transformed input.
-        """
-        return _transform_selected(X, self._transform,
-                                   self.categorical_features, copy=True)
+    @property
+    def feature_indices_(self):
+        # This is very similar to the current attribute
+        # `feature_index_range_`, but only applies to the
+        # subset of categorical features.
+        warnings.warn('The property `feature_indices_` is deprecated and'
+                      ' will be removed in version 0.21', FutureWarning)
+        if not hasattr(self, '_label_encoders'):
+            raise AttributeError("'OneHotEncoder' object has no attribute "
+                                 "'feature_indices_'.")
+        n_categories = [len(le.classes_) for le in self._label_encoders]
+        return np.cumsum([0] + n_categories)
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f2f7d9afad347..3957cb4a63e56 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -18,7 +18,7 @@
 from ..utils.fixes import np_version
 from ..utils.fixes import sparse_min_max
 from ..utils.fixes import astype
-from ..utils.fixes import in1d
+from ..utils.fixes import in1d, setdiff1d
 from ..utils import column_or_1d
 from ..utils.validation import check_array
 from ..utils.validation import check_is_fitted
@@ -149,7 +149,7 @@ def transform(self, y):
         classes = np.unique(y)
         _check_numpy_unicode_bug(classes)
         if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
+            diff = setdiff1d(classes, self.classes_)
             raise ValueError("y contains new labels: %s" % str(diff))
         return np.searchsorted(self.classes_, y)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 7a51049b60242..d526b842c961d 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -6,6 +6,7 @@
 # License: BSD 3 clause
 
 import warnings
+import re
 import numpy as np
 import numpy.linalg as la
 from scipy import sparse
@@ -25,13 +26,14 @@
 from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_false
+from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import skip_if_32bit
 from sklearn.utils.sparsefuncs import mean_variance_axis
-from sklearn.preprocessing.data import _transform_selected
+from sklearn.preprocessing.data import _apply_selected
 from sklearn.preprocessing.data import _handle_zeros_in_scale
 from sklearn.preprocessing.data import Binarizer
 from sklearn.preprocessing.data import KernelCenterer
@@ -1470,13 +1472,13 @@ def test_one_hot_encoder_sparse():
                        [1., 0., 1., 0., 1.]])
 
     # max value given as 3
-    enc = OneHotEncoder(n_values=4)
+    enc = OneHotEncoder(values=4)
     X_trans = enc.fit_transform(X)
     assert_equal(X_trans.shape, (2, 4 * 3))
     assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])
 
     # max value given per feature
-    enc = OneHotEncoder(n_values=[3, 2, 2])
+    enc = OneHotEncoder(values=[3, 2, 2])
     X = [[1, 0, 1], [0, 1, 1]]
     X_trans = enc.fit_transform(X)
     assert_equal(X_trans.shape, (2, 3 + 2 + 2))
@@ -1488,9 +1490,14 @@ def test_one_hot_encoder_sparse():
     # test that an error is raised when out of bounds:
     X_too_large = [[0, 2, 1], [0, 1, 1]]
     assert_raises(ValueError, enc.transform, X_too_large)
-    error_msg = "unknown categorical feature present \[2\] during transform."
+    error_msg = re.escape("Column 1 contains new labels: [2]")
     assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
-    assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)
+
+    error_msg = re.escape("Column 0 contains new labels: [2]")
+    assert_raises_regex(ValueError, error_msg,
+                        OneHotEncoder(n_values=2).fit_transform, X)
+    assert_raises_regex(ValueError, error_msg,
+                        OneHotEncoder(values=2).fit_transform, X)
 
     # test that error is raised when wrong number of features
     assert_raises(ValueError, enc.transform, X[:, :-1])
@@ -1500,13 +1507,77 @@ def test_one_hot_encoder_sparse():
     # test exception on wrong init param
     assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
 
+
+def test_one_hot_encoder_with_negative_integers():
+    # Negative numerical values in inputs should raise an exception
+    X_bad = np.array([[-1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object)
+    X_good = np.array([[1, "cat"], [10, "mouse"], [5, "cat"]], dtype=np.object)
+    assert_raises(ValueError, OneHotEncoder().fit, X_bad)
+
+    ohe = OneHotEncoder().fit(X_good)
+    assert_raises(ValueError, ohe.transform, X_bad)
+
+    # Negative values are okay with "error-strict"
+    OneHotEncoder(handle_unknown='error-strict').fit_transform(X_bad)
+
+
+def test_one_hot_encoder_attr():
+    X = np.array([[1, 7, "cat"], [10, 15, "mouse"], [5, 7, "cat"]], dtype='O')
+
     enc = OneHotEncoder()
-    # test negative input to fit
-    assert_raises(ValueError, enc.fit, [[0], [-1]])
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[0, 3], [3, 5], [5, 7]])
+    assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 1, 1, 2, 2])
+    assert_array_equal(enc.n_values_, [3, 2, 2])
+    assert_array_equal(enc.categories_,
+                       np.array([1, 5, 10, 7, 15, 'cat', 'mouse'], dtype='O'))
+
+    enc = OneHotEncoder('auto', handle_unknown='error-strict',
+                        categorical_features=[True, False, True])
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[0, 3], [5, 6], [3, 5]])
+    assert_array_equal(enc.one_hot_feature_index_, [0, 0, 0, 2, 2, 1])
+    assert_array_equal(enc.n_values_, [3, 0, 2])
+    assert_array_equal(enc.categories_,
+                       np.array([1, 5, 10, 'cat', 'mouse', None], dtype='O'))
+
+    enc = OneHotEncoder(categorical_features=[False, False, True])
+    enc.fit(X)
+    assert_array_equal(enc.feature_index_range_, [[2, 3], [3, 4], [0, 2]])
+    assert_array_equal(enc.one_hot_feature_index_, [2, 2, 0, 1])
+    assert_array_equal(enc.n_values_, [0, 0, 2])
+    assert_array_equal(enc.categories_,
+                       np.array(['cat', 'mouse', None, None], dtype='O'))
+
+
+def test_one_hot_encoder_deprecations():
+    # Check that deprecated features raise warnings
+    X = [[3, 2, 1], [0, 1, 1]]
+
+    # `handle_unknown`="error" will change in v0.21
+    ohe = OneHotEncoder(handle_unknown='error')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # `n_values` is deprecated
+    ohe = OneHotEncoder(n_values='auto', handle_unknown='ignore')
+    assert_warns(FutureWarning, ohe.fit, X)
 
-    # test negative input to transform
-    enc.fit([[0], [1]])
-    assert_raises(ValueError, enc.transform, [[0], [-1]])
+    # Integer input for `values` is deprecated
+    ohe = OneHotEncoder(values=5, handle_unknown='ignore')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # List of integer input for `values` is deprecated
+    ohe = OneHotEncoder(values=[5, 5, 5], handle_unknown='ignore')
+    assert_warns(FutureWarning, ohe.fit, X)
+
+    # `active_features_` is deprecated (and is only available
+    # when `handle_unknown`="error")
+    ohe = OneHotEncoder(handle_unknown='error').fit(X)
+    assert_warns(FutureWarning, getattr, ohe, 'active_features_')
+
+    # `feature_indices_` is deprecated
+    ohe = OneHotEncoder(handle_unknown='ignore').fit(X)
+    assert_warns(FutureWarning, getattr, ohe, 'feature_indices_')
 
 
 def test_one_hot_encoder_dense():
@@ -1526,26 +1597,31 @@ def test_one_hot_encoder_dense():
                        [1., 0., 1., 0., 1.]]))
 
 
-def _check_transform_selected(X, X_expected, sel):
+def _check_apply_selected(X, X_expected, sel, dtype=np.float):
     for M in (X, sparse.csr_matrix(X)):
-        Xtr = _transform_selected(M, Binarizer().transform, sel)
+        Xtr = _apply_selected(M, Binarizer().transform, sel, dtype=dtype)
         assert_array_equal(toarray(Xtr), X_expected)
+        assert_equal(toarray(Xtr).dtype, dtype)
 
 
 def test_transform_selected():
-    X = [[3, 2, 1], [0, 1, 1]]
+    X = np.array([[3, 2, 1], [0, 1, 1]])
 
     X_expected = [[1, 2, 1], [0, 1, 1]]
-    _check_transform_selected(X, X_expected, [0])
-    _check_transform_selected(X, X_expected, [True, False, False])
+    _check_apply_selected(X, X_expected, [0])
+    _check_apply_selected(X, X_expected, [True, False, False])
+    _check_apply_selected(X, X_expected, [True, False, False], dtype=np.int)
 
     X_expected = [[1, 1, 1], [0, 1, 1]]
-    _check_transform_selected(X, X_expected, [0, 1, 2])
-    _check_transform_selected(X, X_expected, [True, True, True])
-    _check_transform_selected(X, X_expected, "all")
+    _check_apply_selected(X, X_expected, [0, 1, 2])
+    _check_apply_selected(X, X_expected, [0, 1, 2], dtype=np.int)
+    _check_apply_selected(X, X_expected, [True, True, True])
+    _check_apply_selected(X, X_expected, "all")
+    _check_apply_selected(X, X_expected, "all", dtype=np.int)
 
-    _check_transform_selected(X, X, [])
-    _check_transform_selected(X, X, [False, False, False])
+    _check_apply_selected(X, X, [])
+    _check_apply_selected(X, X, [False, False, False])
+    _check_apply_selected(X, X, [False, False, False], dtype=np.int)
 
 
 def test_transform_selected_copy_arg():
@@ -1558,8 +1634,8 @@ def _mutating_transformer(X):
     expected_Xtr = [[2, 2], [3, 4]]
 
     X = original_X.copy()
-    Xtr = _transform_selected(X, _mutating_transformer, copy=True,
-                              selected='all')
+    Xtr = _apply_selected(X, _mutating_transformer, copy=True,
+                          selected='all')
 
     assert_array_equal(toarray(X), toarray(original_X))
     assert_array_equal(toarray(Xtr), expected_Xtr)
@@ -1572,7 +1648,7 @@ def _run_one_hot(X, X2, cat):
     return Xtr, X2tr
 
 
-def _check_one_hot(X, X2, cat, n_features):
+def _check_one_hot(X, X2, cat, n_features, X_exp, X2_exp):
     ind = np.where(cat)[0]
     # With mask
     A, B = _run_one_hot(X, X2, cat)
@@ -1587,44 +1663,115 @@ def _check_one_hot(X, X2, cat, n_features):
     assert_array_equal(toarray(A), toarray(C))
     assert_array_equal(toarray(B), toarray(D))
 
+    assert_array_equal(toarray(A), X_exp)
+    assert_array_equal(toarray(B), X2_exp)
+
+
+def test_one_hot_encoder_string():
+    X = [['cat', 'domestic'], ['wolf', 'wild']]
+    enc = OneHotEncoder()
+    enc.fit(X)
+    Xtr = enc.transform([['cat', 'wild']])
+    assert_array_equal(toarray(Xtr), [[1, 0, 0, 1]])
+
 
 def test_one_hot_encoder_categorical_features():
     X = np.array([[3, 2, 1], [0, 1, 1]])
     X2 = np.array([[1, 1, 1]])
 
     cat = [True, False, False]
-    _check_one_hot(X, X2, cat, 4)
+    X_exp = [[0, 1, 2, 1], [1, 0, 1, 1]]
+    X2_exp = [[0, 0, 1, 1]]
+    _check_one_hot(X, X2, cat, 4, X_exp, X2_exp)
 
     # Edge case: all non-categorical
     cat = [False, False, False]
-    _check_one_hot(X, X2, cat, 3)
+    _check_one_hot(X, X2, cat, 3, X, X2)
 
     # Edge case: all categorical
+    X_exp = [[0, 1, 0, 1, 1], [1, 0, 1, 0, 1]]
+    X2_exp = [[0, 0, 1, 0, 1]]
     cat = [True, True, True]
-    _check_one_hot(X, X2, cat, 5)
+    _check_one_hot(X, X2, cat, 5, X_exp, X2_exp)
+
+
+def test_one_hot_encoder_dtypes():
+    # Verify that we can control the output dtype of the transform
+    X = np.array([['cat', 2.1, 1], ['dog', 1, 3], ['mouse', 1, 2]], dtype='O')
+    for dtype in [np.int8, np.float, np.bool]:
+        for sp in [True, False]:
+            oh = OneHotEncoder('auto', handle_unknown='error-strict',
+                               dtype=dtype, sparse=sp)
+            X_tr = oh.fit_transform(X)
+            assert_equal(X_tr.dtype, dtype)
 
 
-def test_one_hot_encoder_unknown_transform():
+def test_one_hot_encoder_unknown_transform_int():
     X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
-    y = np.array([[4, 1, 1]])
+    y = np.array([[0, 3, 1]])
+    X_orig = X.copy()  # Verify X is not modified
 
     # Test that one hot encoder raises error for unknown features
     # present during transform.
     oh = OneHotEncoder(handle_unknown='error')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
+    assert_array_equal(X, X_orig)
+
+    # Test that there's no error for integer features in the auto range
+    y = [[0, 1, 1]]
+    assert_array_equal(toarray(oh.transform(y)), [[1, 0, 0, 0, 1, 0, 0]])
+
+    # But we do error when set to "error-strict"
+    oh = OneHotEncoder(values='auto', handle_unknown='error-strict')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
 
     # Test the ignore option, ignores unknown features.
     oh = OneHotEncoder(handle_unknown='ignore')
     oh.fit(X)
-    assert_array_equal(
-        oh.transform(y).toarray(),
-        np.array([[0., 0., 0., 0., 1., 0., 0.]]))
+    assert_array_equal(toarray(oh.transform(y)), [[1, 0, 0, 0, 1, 0, 0]])
+    assert_array_equal(X, X_orig)
 
-    # Raise error if handle_unknown is neither ignore or error.
-    oh = OneHotEncoder(handle_unknown='42')
+
+def test_one_hot_encoder_unknown_transform_object():
+    X = np.array([['cat', 2.1, 1], ['dog', 1.1, 3], ['mouse', 1.1, 2]],
+                 dtype=np.object)
+    y = np.array([['ET', 2.1, 1]], dtype=np.object)
+    X_orig = X.copy()  # Verify X is not modified
+
+    # Test that one hot encoder raises error for unknown features
+    # present during transform.
+    oh = OneHotEncoder(handle_unknown='error')
     oh.fit(X)
     assert_raises(ValueError, oh.transform, y)
+    assert_array_equal(X, X_orig)
+
+    # Test the ignore option, ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(oh.transform(y).toarray(), [[0, 0, 0, 0, 1, 1, 0, 0]])
+    assert_array_equal(X, X_orig)
+
+    # Raise error if handle_unknown is neither ignore nor error.
+    oh = OneHotEncoder(handle_unknown='42')
+    assert_raises(ValueError, oh.fit, X)
+    assert_array_equal(X, X_orig)
+
+    # Check that in-range integer features are okay in object arrays
+    y = np.array([['cat', 2.1, 0]], dtype=np.object)
+    oh = OneHotEncoder(handle_unknown='error').fit(X)
+    assert_array_equal(oh.transform(y).toarray(), [[1, 0, 0, 0, 1, 0, 0, 0]])
+
+    # "in-range" but not in-training-data float features will error
+    y = np.array([['cat', 1.8, 1]], dtype=np.object)
+    oh = OneHotEncoder(handle_unknown='error').fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # A transform on in-range integers errors in 'error-strict' mode.
+    oh = OneHotEncoder(values='auto', handle_unknown='error-strict').fit(X)
+    assert_raises(ValueError, oh.transform, y)
 
 
 def test_fit_cold_start():
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index d789d5f525cd4..d44555503eaa2 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -217,13 +217,33 @@ def frombuffer_empty(buf, dtype):
     frombuffer_empty = np.frombuffer
 
 
+def _in1d_object(ar1, ar2, invert=False):
+    # np.argsort(kind='mergesort') is only supported for object types after
+    # version 1.8. Hence in1d for object arrays needs to be handled
+    # differently.
+    values1 = set(ar1)
+    values2 = set(ar2)
+    absent_values = values1 - values2
+
+    present = np.ones_like(ar1, dtype=np.bool)
+
+    for value in absent_values:
+        present[ar1 == value] = False
+
+    return ~present if invert else present
+
+
 if np_version < (1, 8):
     def in1d(ar1, ar2, assume_unique=False, invert=False):
         # Backport of numpy function in1d 1.8.1 to support numpy 1.6.2
         # Ravel both arrays, behavior for the first array could be different
+        ar1 = np.asarray(ar1).ravel()
         ar2 = np.asarray(ar2).ravel()
 
+        if (ar1.dtype == object or ar2.dtype == object or
+                ar1.dtype.kind == 'U' or ar2.dtype.kind == 'U'):
+            return _in1d_object(ar1, ar2, invert)
+
         # This code is significantly faster when the condition is satisfied.
         if len(ar2) < 10 * len(ar1) ** 0.145:
             if invert:
@@ -408,3 +428,28 @@ def norm(X, ord=None, axis=None):
 
 else:
     norm = np.linalg.norm
+
+
+if np_version < (1, 8):
+    # Backport of setdiff1d function as it relies on in1d
+    def setdiff1d(ar1, ar2, assume_unique=False):
+        # copy-paste from numpy except for the object type if clause
+        if assume_unique:
+            ar1 = np.asarray(ar1).ravel()
+        else:
+            # Unique is not supported for object arrays till np version 1.8
+            # due to mergesort
+            if ar1.dtype == object:
+                ar1 = np.array(sorted(set(ar1)))
+            else:
+                ar1 = np.unique(ar1)
+
+            if ar2.dtype == object:
+                ar2 = np.array(sorted(set(ar2)))
+            else:
+                ar2 = np.unique(ar2)
+
+        return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
+else:
+    from numpy import setdiff1d  # noqa
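Reviewer note (not part of the patch): below is a minimal usage sketch of the API this diff introduces, written as a doctest in the style of the documentation changes above. It assumes scikit-learn is built from this branch; the expected outputs are derived by hand from the ``fit``/``transform`` semantics and the ``OneHotEncoder`` docstring example in ``sklearn/preprocessing/data.py`` above (the input data mirrors that example), not from running the code. ``handle_unknown='ignore'`` is used to sidestep the deprecated ``handle_unknown='error'`` default.

    >>> import numpy as np
    >>> from sklearn.preprocessing import OneHotEncoder
    >>> X = np.array([['cat', 4], ['mouse', 15], ['dog', 17]], dtype='O')
    >>> enc = OneHotEncoder(handle_unknown='ignore').fit(X)
    >>> enc.feature_index_range_  # columns 0-2 encode feature 0; 3-5 feature 1
    array([[0, 3],
           [3, 6]])
    >>> # Categories are sorted by fit: ['cat', 'dog', 'mouse'] and [4, 15, 17]
    >>> enc.transform(np.array([['dog', 4]], dtype='O')).toarray()
    array([[ 0.,  1.,  0.,  1.,  0.,  0.]])
    >>> # Under handle_unknown='ignore', the unseen value 'fish' yields an
    >>> # all-zero block for feature 0 instead of raising
    >>> enc.transform(np.array([['fish', 4]], dtype='O')).toarray()
    array([[ 0.,  0.,  0.,  1.,  0.,  0.]])

With ``handle_unknown='error-strict'``, the same ``transform`` call on ``'fish'`` would instead raise ``ValueError("Column 0 contains new labels: ...")``, per the error path in ``_transform`` above.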