diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 4c68f9e635498..f3607ee55ebdf 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -481,8 +481,9 @@ new feature of integers (0 to n_categories - 1)::
     >>> enc = preprocessing.OrdinalEncoder()
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
-    >>> enc.fit(X)  # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   missing_values=nan)
     >>> enc.transform([['female', 'from US', 'uses Safari']])
     array([[0., 1., 1.]])
 
@@ -505,8 +506,9 @@ Continuing the example above::
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
     >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='error',
-                  n_values=None, sparse=True)
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='error', missing_values=nan, n_values=None,
+                  sparse=True)
     >>> enc.transform([['female', 'from US', 'uses Safari'],
     ...                ['male', 'from Europe', 'uses Safari']]).toarray()
     array([[1., 0., 0., 1., 0., 1.],
@@ -530,10 +532,10 @@ dataset::
     >>> # feature
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
     >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    OneHotEncoder(categorical_features=None,
-                  categories=[...], drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='error',
-                  n_values=None, sparse=True)
+    OneHotEncoder(categorical_features=None, categories=[...], drop=None,
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='error', missing_values=nan, n_values=None,
+                  sparse=True)
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
     array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])
 
@@ -549,8 +551,9 @@ columns for this feature will be all zeros
     >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
     >>> enc.fit(X)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-                  n_values=None, sparse=True)
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='ignore', missing_values=nan, n_values=None,
+                  sparse=True)
     >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
     array([[1., 0., 0., 0., 0., 0.]])
 
diff --git a/sklearn/impute.py b/sklearn/impute.py
index ea4e8663d0313..2bcbe78971389 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -61,9 +61,14 @@ def _get_mask(X, value_to_mask):
             # np.isnan does not work on object dtypes.
             return _object_dtype_isnan(X)
     else:
-        # X == value_to_mask with object dytpes does not always perform
-        # element-wise for old versions of numpy
-        return np.equal(X, value_to_mask)
+        if X.dtype.kind in ["S", "U"]:
+            # np.equal does not work for byte string and unicode types.
+            # However the == sign works fine.
+            return X == value_to_mask
+        else:
+            # X == value_to_mask with object dtypes does not always perform
+            # element-wise for old versions of numpy
+            return np.equal(X, value_to_mask)
 
 
 def _most_frequent(array, extra_value, n_repeat):
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 6c3df0f22178e..7c5f4b0ed86b2 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -16,8 +16,7 @@
 from ..utils.validation import check_is_fitted
 
 from .base import _transform_selected
-from .label import _encode, _encode_check_unknown
-
+from .label import _nanencode
 
 __all__ = [
     'OneHotEncoder',
@@ -46,7 +45,7 @@ def _check_X(self, X):
         """
         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
             # if not a dataframe, do normal check_array validation
-            X_temp = check_array(X, dtype=None)
+            X_temp = check_array(X, dtype=None, force_all_finite='allow-nan')
             if (not hasattr(X, 'dtype')
                     and np.issubdtype(X_temp.dtype, np.str_)):
                 X = check_array(X, dtype=np.object)
@@ -56,7 +55,7 @@ def _check_X(self, X):
         else:
             # pandas dataframe, do validation later column by column, in order
             # to keep the dtype information to be used in the encoder.
-            needs_validation = True
+            needs_validation = 'allow-nan'
             n_samples, n_features = X.shape
             X_columns = []
 
@@ -76,7 +75,7 @@ def _get_feature(self, X, feature_idx):
         # numpy arrays, sparse arrays
         return X[:, feature_idx]
 
-    def _fit(self, X, handle_unknown='error'):
+    def _fit(self, X, missing_values, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
 
         if self._categories != 'auto':
@@ -89,7 +88,7 @@ def _fit(self, X, handle_unknown='error'):
         for i in range(n_features):
             Xi = X_list[i]
             if self._categories == 'auto':
-                cats = _encode(Xi)
+                cats = _nanencode(Xi, missing_values=missing_values)
             else:
                 cats = np.array(self._categories[i], dtype=Xi.dtype)
                 if Xi.dtype != object:
@@ -97,47 +96,41 @@ def _fit(self, X, handle_unknown='error'):
                         raise ValueError("Unsorted categories are not "
                                          "supported for numerical categories")
             if handle_unknown == 'error':
-                diff = _encode_check_unknown(Xi, cats)
-                if diff:
+                try:
+                    _nanencode(Xi, cats, encode=True,
+                               missing_values=missing_values)
+                except ValueError as e:
+                    # the unseen categories ride along as the second
+                    # argument of the ValueError raised by _nanencode
+                    diff = e.args[1]
                     msg = ("Found unknown categories {0} in column {1}"
                            " during fit".format(diff, i))
                     raise ValueError(msg)
             self.categories_.append(cats)
 
-    def _transform(self, X, handle_unknown='error'):
+    def _transform(self, X, missing_values, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
 
         X_int = np.zeros((n_samples, n_features), dtype=np.int)
-        X_mask = np.ones((n_samples, n_features), dtype=np.bool)
+        X_missing_mask = np.zeros((n_samples, n_features), dtype=np.bool)
+        X_unknown_mask = np.zeros((n_samples, n_features), dtype=np.bool)
+        encode_unknown = handle_unknown != 'error'
 
         for i in range(n_features):
             Xi = X_list[i]
-            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
-                                                     return_mask=True)
-
-            if not np.all(valid_mask):
-                if handle_unknown == 'error':
-                    msg = ("Found unknown categories {0} in column {1}"
-                           " during transform".format(diff, i))
-                    raise ValueError(msg)
-                else:
-                    # Set the problematic rows to an acceptable value and
-                    # continue `The rows are marked `X_mask` and will be
-                    # removed later.
-                    X_mask[:, i] = valid_mask
-                    # cast Xi into the largest string type necessary
-                    # to handle different lengths of numpy strings
-                    if (self.categories_[i].dtype.kind in ('U', 'S')
-                            and self.categories_[i].itemsize > Xi.itemsize):
-                        Xi = Xi.astype(self.categories_[i].dtype)
-                    else:
-                        Xi = Xi.copy()
+            encode_results = _nanencode(Xi, self.categories_[i],
+                                        missing_values=missing_values,
+                                        encode=True,
+                                        encode_unknown=encode_unknown)
+
+            if len(encode_results) == 4:
+                _, encoded, missing_mask, unknown_mask = encode_results
+                X_unknown_mask[:, i] = unknown_mask
+            else:
+                _, encoded, missing_mask = encode_results
 
-                    Xi[~valid_mask] = self.categories_[i][0]
-            _, encoded = _encode(Xi, self.categories_[i], encode=True)
             X_int[:, i] = encoded
+            X_missing_mask[:, i] = missing_mask
 
-        return X_int, X_mask
+        return X_int, X_missing_mask, X_unknown_mask
 
 
 class OneHotEncoder(_BaseEncoder):
@@ -194,6 +187,16 @@ class OneHotEncoder(_BaseEncoder):
     dtype : number type, default=np.float
         Desired dtype of output.
 
+    missing_values : scalar, default=np.nan
+        Value to be encoded as missing.
+
+    handle_missing : 'all-zero', 'category', 'all-missing', default='all-zero'
+        Defines how the missing value should be handled.
+
+        - 'all-zero' : the missing value is encoded as all zeros
+        - 'category' : another category is appended to flag missing values
+        - 'all-missing' : the missing value is encoded as all nan
+
     handle_unknown : 'error' or 'ignore', default='error'.
         Whether to raise an error or ignore if an unknown categorical feature
         is present during transform (default is to raise). When this parameter
@@ -280,8 +283,9 @@ class OneHotEncoder(_BaseEncoder):
     ... # doctest: +ELLIPSIS
     ... # doctest: +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
-                  dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-                  n_values=None, sparse=True)
+                  dtype=<... 'numpy.float64'>, handle_missing='all-zero',
+                  handle_unknown='ignore', missing_values=nan, n_values=None,
+                  sparse=True)
 
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
@@ -317,12 +321,15 @@ class OneHotEncoder(_BaseEncoder):
 
     def __init__(self, n_values=None, categorical_features=None,
                  categories=None, drop=None, sparse=True, dtype=np.float64,
+                 missing_values=np.nan, handle_missing='all-zero',
                  handle_unknown='error'):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.n_values = n_values
+        self.missing_values = missing_values
+        self.handle_missing = handle_missing
         self.categorical_features = categorical_features
         self.drop = drop
 
@@ -489,7 +496,8 @@ def fit(self, X, y=None):
                 copy=True)
             return self
         else:
-            self._fit(X, handle_unknown=self.handle_unknown)
+            self._fit(X, missing_values=self.missing_values,
+                      handle_unknown=self.handle_unknown)
             self.drop_idx_ = self._compute_drop_idx()
             return self
 
@@ -534,6 +542,10 @@ def _validate_keywords(self):
             msg = ("handle_unknown should be either 'error' or 'ignore', "
                    "got {0}.".format(self.handle_unknown))
             raise ValueError(msg)
+        if self.handle_missing not in ('all-zero', 'category', 'all-missing'):
+            msg = ("handle_missing should be either 'all-zero', 'category', "
+                   "or 'all-missing', got {0}.".format(self.handle_missing))
+            raise ValueError(msg)
         # If we have both dropped columns and ignored unknown
         # values, there will be ambiguous cells. This creates difficulties
         # in interpreting the model.
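
To make the three handle_missing strategies concrete before the implementation
hunks below, here is a minimal usage sketch. It is illustrative only and not
part of the patch; it assumes the patch is applied, and the expected outputs
mirror the parametrized expectations in test_one_hot_encoder_handle_missing
further down.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a'], ['b'], [np.nan], ['c']], dtype=object)

    # 'all-zero' (the default): a missing entry becomes a row of zeros.
    enc = OneHotEncoder(categories='auto', handle_missing='all-zero',
                        missing_values=np.nan, sparse=False)
    enc.fit_transform(X)
    # array([[1., 0., 0.],
    #        [0., 1., 0.],
    #        [0., 0., 0.],
    #        [0., 0., 1.]])

    # 'category': one extra column per feature flags missing entries.
    enc = OneHotEncoder(categories='auto', handle_missing='category',
                        missing_values=np.nan, sparse=False)
    enc.fit_transform(X)
    # array([[1., 0., 0., 0.],
    #        [0., 1., 0., 0.],
    #        [0., 0., 0., 1.],
    #        [0., 0., 1., 0.]])

    # 'all-missing': a missing entry becomes a row of NaNs.
    enc = OneHotEncoder(categories='auto', handle_missing='all-missing',
                        missing_values=np.nan, sparse=False)
    enc.fit_transform(X)
    # array([[ 1.,  0.,  0.],
    #        [ 0.,  1.,  0.],
    #        [nan, nan, nan],
    #        [ 0.,  0.,  1.]])
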
@@ -673,12 +685,38 @@ def _legacy_transform(self, X):
 
         return out if self.sparse else out.toarray()
 
+    def _make_onehot_sparse_matrix(self, labels, mask, cat_ns):
+        # Build the one-hot CSR matrix directly from the integer codes:
+        # each valid cell contributes a single 1 in its feature's block
+        flat_mask = mask.ravel()
+        n_values = np.array([0] + cat_ns)
+        feature_indices = np.cumsum(n_values)
+
+        indices = (labels + feature_indices[:-1]).ravel()[flat_mask]
+        indptr = mask.sum(axis=1).cumsum()
+        indptr = np.insert(indptr, 0, 0)
+        data = np.ones(len(indices))
+
+        out = sparse.csr_matrix((data, indices, indptr),
+                                shape=(len(labels), feature_indices[-1]),
+                                dtype=self.dtype)
+        return out
+
+    def _make_nan_sparse_matrix(self, Xi_missing, n_categories):
+        # Rows flagged as missing get a full block of NaNs for this feature
+        n_missing = np.sum(Xi_missing)
+        indptr = np.cumsum(Xi_missing) * n_categories
+        indptr = np.insert(indptr, 0, 0)
+        indices = np.full((n_missing, n_categories), np.arange(n_categories))
+        indices = indices.ravel()
+        data = np.full(len(indices), np.nan)
+        na_matrix = sparse.csr_matrix((data, indices, indptr),
+                                      (len(Xi_missing), n_categories))
+        return na_matrix
+
     def _transform_new(self, X):
         """New implementation assuming categorical input"""
         # validation of X happens in _check_X called by _transform
-        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-
-        n_samples, n_features = X_int.shape
+        X_int, X_missing, X_unknown = (
+            self._transform(X, missing_values=self.missing_values,
+                            handle_unknown=self.handle_unknown))
 
         if self.drop is not None:
             to_drop = self.drop_idx_.reshape(1, -1)
@@ -686,24 +724,30 @@ def _transform_new(self, X):
 
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
-            keep_cells = X_int != to_drop
-            X_mask &= keep_cells
+            drop_cells = X_int == to_drop
+            X_unknown |= drop_cells
             X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - 1 for cats in self.categories_]
+            cats_ns = [len(cats) - 1 for cats in self.categories_]
         else:
-            n_values = [len(cats) for cats in self.categories_]
-
-        mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
-        feature_indices = np.cumsum(n_values)
-        indices = (X_int + feature_indices[:-1]).ravel()[mask]
-        indptr = X_mask.sum(axis=1).cumsum()
-        indptr = np.insert(indptr, 0, 0)
-        data = np.ones(n_samples * n_features)[mask]
+            cats_ns = [len(cats) for cats in self.categories_]
+
+        if self.handle_missing == 'category':
+            # missing entries are encoded as the extra appended category
+            for i, c in enumerate(cats_ns):
+                Xi_missing = X_missing[:, i]
+                Xi_int = X_int[:, i]
+                Xi_int[Xi_missing] = c
+            cats_ns = [c + 1 for c in cats_ns]
+            X_valid = ~X_unknown
+            out = self._make_onehot_sparse_matrix(X_int, X_valid, cats_ns)
+        else:
+            X_valid = ~(X_missing | X_unknown)
+            out = self._make_onehot_sparse_matrix(X_int, X_valid, cats_ns)
+            if self.handle_missing == 'all-missing':
+                na_mat = [self._make_nan_sparse_matrix(X_missing[:, i], c)
+                          for i, c in enumerate(cats_ns)]
+                na_mat = sparse.hstack(na_mat)
+                out += na_mat
 
-        out = sparse.csr_matrix((data, indices, indptr),
-                                shape=(n_samples, feature_indices[-1]),
-                                dtype=self.dtype)
         if not self.sparse:
             return out.toarray()
         else:
@@ -751,16 +795,22 @@ def inverse_transform(self, X):
         #   raise ValueError("only supported for categorical features")
 
         check_is_fitted(self, 'categories_')
-        X = check_array(X, accept_sparse='csr')
+        if self.handle_missing == 'all-missing':
+            force_finite = 'allow-nan'
+        else:
+            force_finite = True
+        X = check_array(X, accept_sparse='csr', force_all_finite=force_finite)
 
         n_samples, _ = X.shape
         n_features = len(self.categories_)
-        if self.drop is None:
-            n_transformed_features = sum(len(cats)
-                                         for cats in self.categories_)
-        else:
-            n_transformed_features = sum(len(cats) - 1
-                                         for cats in self.categories_)
+
+        n_transformed_features = sum(len(cats) for cats in self.categories_)
+
+        if self.handle_missing == 'category':
+            n_transformed_features += len(self.categories_)
+
+        if self.drop is not None:
+            n_transformed_features -= len(self.categories_)
 
         # validate shape of passed X
         msg = ("Shape of the passed X data is not correct. Expected {0} "
@@ -769,17 +819,24 @@ def inverse_transform(self, X):
             raise ValueError(msg.format(n_transformed_features, X.shape[1]))
 
         # create resulting array of appropriate dtype
-        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
+        dt = np.find_common_type([cat.dtype for cat in self.categories_],
+                                 [type(self.missing_values)])
         X_tr = np.empty((n_samples, n_features), dtype=dt)
 
         j = 0
         found_unknown = {}
 
         for i in range(n_features):
-            if self.drop is None:
-                cats = self.categories_[i]
-            else:
-                cats = np.delete(self.categories_[i], self.drop_idx_[i])
+            cats = self.categories_[i]
+            if self.drop is not None:
+                cats = np.delete(cats, self.drop_idx_[i])
+            if self.handle_missing == 'category':
+                cdt = np.find_common_type([cats.dtype],
+                                          [type(self.missing_values)])
+                if cdt != cats.dtype:
+                    cats = cats.astype(cdt)
+                cats = np.append(cats, self.missing_values)
+
             n_categories = len(cats)
 
             # Only happens if there was a column with a unique
@@ -805,6 +862,19 @@ def inverse_transform(self, X):
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
                     X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
+            elif self.handle_missing == 'all-zero':
+                # an all-zero row marks a missing entry under 'all-zero'
+                missing = np.asarray(sub.sum(axis=1) == 0).flatten()
+                X_tr[:, i][missing] = self.missing_values
+
+            if self.handle_missing == 'all-missing':
+                if sparse.isspmatrix_csr(sub):
+                    missing = sub.copy()
+                    missing.data = np.isnan(missing.data)
+                else:
+                    missing = np.isnan(sub)
+                missing = np.asarray(missing.sum(axis=1) == n_categories)
+                missing = missing.flatten()
+                X_tr[:, i][missing] = self.missing_values
 
             j += n_categories
 
@@ -894,7 +964,10 @@ class OrdinalEncoder(_BaseEncoder):
     >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
     >>> enc.fit(X)
     ... # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    ... # doctest: +NORMALIZE_WHITESPACE
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   missing_values=nan)
+
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
     >>> enc.transform([['Female', 3], ['Male', 1]])
@@ -913,9 +986,11 @@ class OrdinalEncoder(_BaseEncoder):
         between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64,
+                 missing_values=np.nan):
         self.categories = categories
         self.dtype = dtype
+        self.missing_values = missing_values
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
@@ -933,7 +1008,7 @@ def fit(self, X, y=None):
         # base classes uses _categories to deal with deprecations in
         # OneHoteEncoder: can be removed once deprecations are removed
         self._categories = self.categories
-        self._fit(X)
+        self._fit(X, missing_values=self.missing_values)
 
         return self
 
@@ -951,7 +1026,7 @@ def transform(self, X):
             Transformed input.
""" - X_int, _ = self._transform(X) + X_int, _, _ = self._transform(X, missing_values=self.missing_values) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f7cffa1e663b5..1ce6c2ae6ba8c 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -17,7 +17,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.sparsefuncs import min_max_axis -from ..utils import column_or_1d +from ..utils import column_or_1d, is_scalar_nan from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples @@ -33,131 +33,217 @@ ] -def _encode_numpy(values, uniques=None, encode=False): - # only used in _encode below, see docstring there for details - if uniques is None: - if encode: - uniques, encoded = np.unique(values, return_inverse=True) - return uniques, encoded - else: - # unique sorts - return np.unique(values) - if encode: - diff = _encode_check_unknown(values, uniques) - if diff: - raise ValueError("y contains previously unseen labels: %s" - % str(diff)) - encoded = np.searchsorted(uniques, values) - return uniques, encoded +def _nanunique(ar, return_inverse=False): + if return_inverse: + uniques, reverse = np.unique(ar, return_inverse=return_inverse) else: - return uniques + uniques = np.unique(ar) + # np.nan is always sorted last + if len(uniques) and is_scalar_nan(uniques[-1]): + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx+1] + if return_inverse: + reverse[reverse > nan_idx] = nan_idx -def _encode_python(values, uniques=None, encode=False): - # only used in _encode below, see docstring there for details - if uniques is None: - uniques = sorted(set(values)) - uniques = np.array(uniques, dtype=values.dtype) - if encode: - table = {val: i for i, val in enumerate(uniques)} - try: - encoded = np.array([table[v] for v in values]) - except KeyError as e: - raise ValueError("y contains previously unseen labels: %s" - % str(e)) - return uniques, encoded + if return_inverse: + return uniques, reverse else: return uniques -def _encode(values, uniques=None, encode=False): - """Helper function to factorize (find uniques) and encode values. +def _nanin1d(ar1, ar2, assume_unique=False): + ar1 = np.ravel(ar1) + ar2 = np.ravel(ar2) - Uses pure python method for object dtype, and numpy method for - all other dtypes. - The numpy method has the limitation that the `uniques` need to - be sorted. Importantly, this is not checked but assumed to already be - the case. The calling method needs to ensure this for all non-object - values. + if not assume_unique: + ar1, rev = _nanunique(ar1, return_inverse=True) + ar2 = _nanunique(ar2) - Parameters - ---------- - values : array - Values to factorize or encode. - uniques : array, optional - If passed, uniques are not determined from passed values (this - can be because the user specified categories, or because they - already have been determined in fit). - encode : bool, default False - If True, also encode the values into integer codes based on `uniques`. 
+    # The FutureWarning is usually triggered by a nan comparison so it might be
+    # better to just suppress the warning here
+    with warnings.catch_warnings():
+        warnings.simplefilter(action='ignore', category=FutureWarning)
+        in1d = np.in1d(ar1, ar2, True)
+    try:
+        in1d[-1] = (in1d[-1] or
+                    (is_scalar_nan(ar1[-1]) and is_scalar_nan(ar2[-1])))
+    except IndexError:
+        pass
 
-    Returns
-    -------
-    uniques
-        If ``encode=False``. The unique values are sorted if the `uniques`
-        parameter was None (and thus inferred from the data).
-    (uniques, encoded)
-        If ``encode=True``.
+    if assume_unique:
+        return in1d
+    else:
+        return in1d[rev]
 
-    """
-    if values.dtype == object:
-        try:
-            res = _encode_python(values, uniques, encode)
-        except TypeError:
-            raise TypeError("argument must be a string or number")
-        return res
+
+def _nansetdiff1d(ar1, ar2, assume_unique=False):
+    if assume_unique:
+        ar1 = np.ravel(ar1)
     else:
-        return _encode_numpy(values, uniques, encode)
+        ar1 = _nanunique(ar1)
+        ar2 = _nanunique(ar2)
+    return ar1[~_nanin1d(ar1, ar2, True)]
 
 
-def _encode_check_unknown(values, uniques, return_mask=False):
-    """
-    Helper function to check for unknowns in values to be encoded.
+def _nanencode_numpy(values, uniques=None, encode=False,
+                     missing_values=np.nan, encode_unknown=False):
+    check_values = True
+    if uniques is None:
+        uniques = _nanunique(values)
+        uniques = _nansetdiff1d(uniques, [missing_values], True)
+        check_values = False
 
-    Uses pure python method for object dtype, and numpy method for
-    all other dtypes.
+    if encode:
+        if check_values:
+            unique_values = _nanunique(values)
+            unseen = _nansetdiff1d(unique_values, [missing_values], True)
+            unseen = _nansetdiff1d(unseen, uniques, True)
 
-    Parameters
-    ----------
-    values : array
-        Values to check for unknowns.
-    uniques : array
-        Allowed uniques values.
-    return_mask : bool, default False
-        If True, return a mask of the same shape as `values` indicating
-        the valid values.
+            if not encode_unknown:
+                if len(unseen):
+                    raise ValueError("y contains previously unseen labels: %s"
+                                     % str(unseen), unseen)
 
-    Returns
-    -------
-    diff : list
-        The unique values present in `values` and not in `uniques` (the
-        unknown values).
-    valid_mask : boolean array
-        Additionally returned if ``return_mask=True``.
+        encoded = np.searchsorted(uniques, values)
+        from ..impute import _get_mask
+        missing_mask = _get_mask(values, missing_values)
 
-    """
-    if values.dtype == object:
-        uniques_set = set(uniques)
-        diff = list(set(values) - uniques_set)
-        if return_mask:
-            if diff:
-                valid_mask = np.array([val in uniques_set for val in values])
+        if encode_unknown:
+            if check_values:
+                unknown_mask = _nanin1d(values, unseen)
             else:
-                valid_mask = np.ones(len(values), dtype=bool)
-            return diff, valid_mask
+                unknown_mask = np.zeros_like(values, dtype=np.bool)
+            return uniques, encoded, missing_mask, unknown_mask
         else:
-            return diff
+            return uniques, encoded, missing_mask
     else:
-        unique_values = np.unique(values)
-        diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
-        if return_mask:
-            if diff:
-                valid_mask = np.in1d(values, uniques)
-            else:
-                valid_mask = np.ones(len(values), dtype=bool)
-            return diff, valid_mask
+        return uniques
+
+
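The helpers above exist because the stock numpy set routines mishandle NaN:
equality with NaN is always False, so np.unique (on the numpy versions this
patch targets) can return several NaNs and np.in1d never matches one. A quick
illustration in plain numpy, independent of this patch:

    import numpy as np

    a = np.array([1.0, np.nan, np.nan])

    # nan != nan, so np.unique keeps every nan it sees,
    # hence the trimming after searchsorted in _nanunique
    np.unique(a)          # array([ 1., nan, nan])

    # np.in1d relies on ==, so a nan is never found,
    # hence the explicit last-element check in _nanin1d
    np.in1d([np.nan], a)  # array([False])

    # sorting puts nan last, which both helpers exploit
    np.sort(a)            # array([ 1., nan, nan])
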
+# Since it is cheaper to remove an item from a set than from an array, the
+# exclusion of the missing value is folded into _nanunique_object. Also, since
+# None and np.nan are not sortable, they are handled separately after the sort.
+def _nanunique_object(ar, exclude_value):
+    items = set(ar)
+
+    # nan might not be discarded since nan comes in different forms
+    items.discard(exclude_value)
+
+    # Handle None afterwards if it is in the items. Set a flag for now.
+    try:
+        items.remove(None)
+        has_none = True
+    except KeyError:
+        has_none = False
+
+    # Handle nan afterwards if it is in the items. Set a flag for now. Since
+    # nan can come in different forms, we check everything.
+    items_not_na = [i for i in items if not is_scalar_nan(i)]
+    has_na = len(items) > len(items_not_na)
+
+    # Sort without None and nan
+    uniques = sorted(items_not_na)
+
+    # Bring back None if needed
+    if has_none:
+        uniques.append(None)
+
+    # Bring back nan if needed. Since nan comes in different forms, it might
+    # still exist despite being discarded from the set
+    if has_na and not is_scalar_nan(exclude_value):
+        uniques.append(np.nan)
+
+    return uniques
+
+
+# Since nan comes in multiple forms, hash is not enough to identify it
+def _dict_to_mapper(d, **kwargs):
+    try:
+        nan_value = kwargs['nan_value']
+
+        def mapper(x):
+            try:
+                return d[x]
+            except KeyError:
+                if is_scalar_nan(x):
+                    return nan_value
+                else:
+                    raise
+        return mapper
+    except KeyError:
+        return lambda x: d[x]
+
+
+def _make_mapper(uniques, missing_values, missing_index):
+    if is_scalar_nan(uniques[-1]):
+        # nan is encoded as len(uniques) - 1
+        table = {val: i for i, val in enumerate(uniques[:-1])}
+        table[missing_values] = missing_index
+        table_mapper = _dict_to_mapper(table, nan_value=len(uniques) - 1)
+    else:
+        table = {val: i for i, val in enumerate(uniques)}
+        if is_scalar_nan(missing_values):
+            # nan is encoded as the missing index
+            table_mapper = _dict_to_mapper(table, nan_value=missing_index)
+        else:
+            # No need for a nan value
+            table[missing_values] = missing_index
+            table_mapper = _dict_to_mapper(table)
+    return table_mapper
+
+
+def _nanencode_python(values, uniques=None, encode=False,
+                      missing_values=np.nan, encode_unknown=False):
+    if uniques is None:
+        uniques = _nanunique_object(values, missing_values)
+        uniques = np.array(uniques, dtype=values.dtype)
+
+    if encode:
+        # Use index -1 so that the number encoding will not cause failures
+        # if used by the consumer for indexing. It will still fail when used
+        # for indexing an empty array, but so would any other index.
+        missing_index = -1
+        mapper = _make_mapper(uniques, missing_values, missing_index)
+
+        if encode_unknown:
+            unknown_index = -2
+
+            def safe_mapper(x):
+                try:
+                    return mapper(x)
+                except KeyError:
+                    return unknown_index
+
+            np_mapper = safe_mapper
+        else:
+            np_mapper = mapper
+
+        try:
+            encoded = np.array([np_mapper(v) for v in values], dtype=np.int)
+            missing_mask = encoded == missing_index
+        except KeyError as e:
+            unseen = e.args[0]
+            raise ValueError("y contains previously unseen labels: %s"
+                             % str(unseen), unseen)
+
+        if encode_unknown:
+            unknown_mask = encoded == unknown_index
+            return uniques, encoded, missing_mask, unknown_mask
+        else:
+            return uniques, encoded, missing_mask
+    else:
+        return uniques
+
+
+def _nanencode(values, uniques=None, encode=False,
+               missing_values=np.nan, encode_unknown=False):
+    if values.dtype == object:
+        return _nanencode_python(values, uniques, encode,
+                                 missing_values, encode_unknown)
+    else:
+        return _nanencode_numpy(values, uniques, encode,
+                                missing_values, encode_unknown)
 
 
 class LabelEncoder(BaseEstimator, TransformerMixin):
@@ -217,7 +303,7 @@ def fit(self, y):
         self : returns an instance of self.
""" y = column_or_1d(y, warn=True) - self.classes_ = _encode(y) + self.classes_ = _nanencode(y) return self def fit_transform(self, y): @@ -233,7 +319,7 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) - self.classes_, y = _encode(y, encode=True) + self.classes_, y, _ = _nanencode(y, encode=True) return y def transform(self, y): @@ -254,7 +340,7 @@ def transform(self, y): if _num_samples(y) == 0: return np.array([]) - _, y = _encode(y, uniques=self.classes_, encode=True) + _, y, _ = _nanencode(y, uniques=self.classes_, encode=True) return y def inverse_transform(self, y): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 850efe22b5d11..caf96226fba20 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -590,24 +590,55 @@ def test_one_hot_encoder_feature_names_unicode(): assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names) +@pytest.mark.parametrize('values, missing_values', [ + (np.array([0.0, 1.0, np.nan, 2.0]), np.nan), + (np.array([0.0, 1.0, -1.0, 2.0]), -1.0), + (np.array([0, 1, -1, 2]), -1), + (np.array(list('abdc'), dtype='str'), 'd'), + (np.array(['a', 'b', None, 'c'], dtype=object), None), + (np.array(['a', 'b', np.nan, 'c'], dtype=object), np.nan), + (np.array(['a', 'b', 'd', 'c'], dtype=object), 'd'), +]) +@pytest.mark.parametrize('handle_missing, expected', [ + ('all-zero', np.array([[1, 0, 0], [0, 1, 0], [0, 0, 0], [0, 0, 1]])), + ('category', np.array([[1, 0, 0, 0], [0, 1, 0, 0], + [0, 0, 0, 1], [0, 0, 1, 0]])), + ('all-missing', np.array([[1, 0, 0], [0, 1, 0], [np.nan] * 3, [0, 0, 1]])) +]) +@pytest.mark.parametrize('sp', [True, False]) +def test_one_hot_encoder_handle_missing(values, missing_values, + handle_missing, expected, sp): + values = values.reshape(-1, 1) + enc = OneHotEncoder(categories='auto', + handle_missing=handle_missing, + missing_values=missing_values, + sparse=sp) + result = enc.fit_transform(values) + cmp_result = result.toarray() if sp else result + assert_array_equal(expected, cmp_result) + nan_mask = np.array([0, 0, 1, 0], dtype=np.bool) + + # There is a bug with assert_array_equal when comparing object arrays with + # nans. Thus, the nans are compared separately. 
+    reverse = enc.inverse_transform(result)
+    assert_array_equal(values[~nan_mask], reverse[~nan_mask])
+    np.testing.assert_equal(values[nan_mask], reverse[nan_mask])
+
+
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])
 @pytest.mark.parametrize("as_data_frame", [False, True],
                          ids=['array', 'dataframe'])
 @pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
-def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
+def test_one_hot_encoder_accept_missing(X, as_data_frame, handle_unknown):
     if as_data_frame:
         pd = pytest.importorskip('pandas')
         X = pd.DataFrame(X)
 
     ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit(X)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit_transform(X)
+    ohe.fit(X)
+    ohe.fit_transform(X)
 
     if as_data_frame:
         X_partial = X.iloc[:1, :]
@@ -615,9 +646,7 @@ def test_one_hot_encoder_accept_missing(X, as_data_frame, handle_unknown):
         X_partial = X[:1, :]
 
     ohe.fit(X_partial)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.transform(X)
+    ohe.transform(X)
 
 
 @pytest.mark.parametrize("X", [
@@ -678,19 +707,12 @@ def test_ordinal_encoder_inverse():
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])
-def test_ordinal_encoder_raise_missing(X):
+def test_ordinal_encoder_accept_missing(X):
     ohe = OrdinalEncoder()
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit(X)
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.fit_transform(X)
-
+    ohe.fit(X)
+    ohe.fit_transform(X)
     ohe.fit(X[:1, :])
-
-    with pytest.raises(ValueError, match="Input contains NaN"):
-        ohe.transform(X)
+    ohe.transform(X)
 
 
 def test_encoder_dtypes():
@@ -732,7 +754,7 @@ def test_encoder_dtypes_pandas():
     assert_array_equal(enc.transform(X).toarray(), exp)
 
     X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
-    X_type = [int, object, float]
+    X_type = [X[col].dtype for col in X.columns]
     enc.fit(X)
     assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
     assert_array_equal(enc.transform(X).toarray(), exp)
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 8a7db601686a8..02f6a0edc34a9 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -25,7 +25,7 @@
 from sklearn.preprocessing.label import _inverse_binarize_thresholding
 from sklearn.preprocessing.label import _inverse_binarize_multiclass
-from sklearn.preprocessing.label import _encode
+from sklearn.preprocessing.label import _nanencode
 
 from sklearn import datasets
 
@@ -589,19 +589,160 @@ def test_inverse_binarize_multiclass():
 
 
 @pytest.mark.parametrize(
-        "values, expected",
+        "values, expected, extra_value",
         [(np.array([2, 1, 3, 1, 3], dtype='int64'),
-          np.array([1, 2, 3], dtype='int64')),
+          np.array([1, 2, 3], dtype='int64'),
+          4),
          (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
-          np.array(['a', 'b', 'c'], dtype=object)),
+          np.array(['a', 'b', 'c'], dtype=object),
+          'd'),
          (np.array(['b', 'a', 'c', 'a', 'c']),
-          np.array(['a', 'b', 'c']))],
+          np.array(['a', 'b', 'c']),
+          'd')],
         ids=['int64', 'object', 'str'])
-def test_encode_util(values, expected):
-    uniques = _encode(values)
+def test_nanencode_util_as_encode(values, expected, extra_value):
+    uniques = _nanencode(values)
     assert_array_equal(uniques, expected)
-    uniques, encoded = _encode(values, encode=True)
+    uniques, encoded, na_mask = _nanencode(values, encode=True)
     assert_array_equal(uniques, expected)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
-    _, encoded = _encode(values, uniques, encode=True)
+    assert_array_equal(na_mask, np.zeros_like(values, dtype=np.bool))
+    uniques_, encoded, na_mask = _nanencode(values, uniques, encode=True)
+    assert_array_equal(uniques_, uniques)
     assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
+    assert_array_equal(na_mask, np.zeros_like(values, dtype=np.bool))
+
+    unclean_value = np.append(values, extra_value)
+    assert_raises(ValueError, _nanencode, unclean_value, uniques, encode=True)
+
+
+# There is a bug with assert_array_equal when comparing object arrays with
+# nans. Since nans are only seen at the end, the last items are asserted
+# separately. See https://github.com/numpy/numpy/issues/9023
+def nanassert_array_equal(a, b):
+    # The sklearn assert_equal does not support nans
+    np.testing.assert_equal(a[-1], b[-1])
+    assert_array_equal(a[:-1], b[:-1])
+
+
+@pytest.mark.parametrize(
+    "values, expected, missing_values",
+    [(np.array([2, 1, 3, 1, 3, np.nan, 2, np.float('nan')],
+               dtype='float64'),
+      np.array([1, 2, 3], dtype='float64'), np.nan),
+     (np.array([2, 1, np.nan, 1, np.float('nan'), 3, 2, 3],
+               dtype='float64'),
+      np.array([1, 2, np.nan], dtype='float64'), 3),
+     (np.array([2, 1, 3, 1, 3, 4, 2, 4], dtype='int64'),
+      np.array([1, 2, 3], dtype='int64'), 4),
+     (np.array(['b', 'a', np.nan, 'a', np.float('nan'), None, 'b', None],
+               dtype=object),
+      np.array(['a', 'b', np.nan], dtype=object), None),
+     (np.array(['b', 'a', None, 'a', None, np.float('nan'), 'b', np.nan],
+               dtype=object),
+      np.array(['a', 'b', None], dtype=object), np.nan),
+     (np.array(['b', 'a', None, 'a', None, 'c', 'b', 'c'], dtype=object),
+      np.array(['a', 'b', None], dtype=object), 'c'),
+     (np.array(['b', 'a', 'c', 'a', 'c', 'd', 'b', 'd']),
+      np.array(['a', 'b', 'c']), 'd')],
+    ids=['float64_nan', 'float64_value', 'int64_value',
+         'object_none', 'object_nan', 'object_value', 'str_value'])
+def test_nanencode_util_missing(values, expected, missing_values):
+    encoding_answer = np.array([1, 0, 2, 0, 2, 1])
+    na_mask_answer = np.array([0, 0, 0, 0, 0, 1, 0, 1], dtype=np.bool)
+
+    uniques = _nanencode(values, missing_values=missing_values)
+    nanassert_array_equal(uniques, expected)
+
+    uniques_ = _nanencode(values, uniques, False, missing_values)
+    nanassert_array_equal(uniques_, uniques)
+
+    res = _nanencode(values, None, True, missing_values)
+    uniques, encoded, na_mask = res
+    nanassert_array_equal(uniques, expected)
+    assert_array_equal(na_mask, na_mask_answer)
+    assert_array_equal(encoded[~na_mask], encoding_answer)
+
+    res = _nanencode(values, uniques, True, missing_values)
+    uniques_, encoded, na_mask = res
+    nanassert_array_equal(uniques_, uniques)
+    assert_array_equal(na_mask, na_mask_answer)
+    assert_array_equal(encoded[~na_mask], encoding_answer)
+
+
+@pytest.mark.parametrize(
+    "fit_values, tr_values, expected, missing_values",
+    [(np.array([2, 1, 3, 1, 3, np.nan, 2, np.float('nan')],
+               dtype='float64'),
+      np.array([4, 2, 1, 5, 3, 1, 3, np.nan, 2, np.float('nan')],
+               dtype='float64'),
+      np.array([1, 2, 3], dtype='float64'), np.nan),
+     (np.array([2, 1, np.nan, 1, np.float('nan'), 3, 2, 3],
+               dtype='float64'),
+      np.array([4, 2, 1, 5, np.nan, 1, np.float('nan'), 3, 2, 3],
+               dtype='float64'),
+      np.array([1, 2, np.nan], dtype='float64'), 3),
+     (np.array([2, 1, 3, 1, 3, 4, 2, 4],
+               dtype='int64'),
+      np.array([5, 2, 1, 6, 3, 1, 3, 4, 2, 4], dtype='int64'),
+      np.array([1, 2, 3], dtype='int64'), 4),
+     (np.array(['b', 'a', np.nan, 'a', np.float('nan'), None, 'b', None],
+               dtype=object),
+      np.array(['d', 'b', 'a', 'e', np.nan, 'a', np.float('nan'),
+                None, 'b', None], dtype=object),
+      np.array(['a', 'b', np.nan], dtype=object), None),
+     (np.array(['b', 'a', None, 'a', None, np.float('nan'), 'b', np.nan],
+               dtype=object),
+      np.array(['c', 'b', 'a', 'd', None, 'a', None, np.float('nan'),
+                'b', np.nan], dtype=object),
+      np.array(['a', 'b', None], dtype=object), np.nan),
+     (np.array(['b', 'a', None, 'a', None, 'c', 'b', 'c'], dtype=object),
+      np.array(['d', 'b', 'a', 'e', None, 'a', None, 'c', 'b', 'c'],
+               dtype=object),
+      np.array(['a', 'b', None], dtype=object), 'c'),
+     (np.array(['b', 'a', 'c', 'a', 'c', 'd', 'b', 'd']),
+      np.array(['e', 'b', 'a', 'f', 'c', 'a', 'c', 'd', 'b', 'd']),
+      np.array(['a', 'b', 'c']), 'd')],
+    ids=['float64_nan', 'float64_value', 'int64_value',
+         'object_none', 'object_nan', 'object_value', 'str_value'])
+def test_nanencode_util_missing_unknown(fit_values, tr_values,
+                                        expected, missing_values):
+    encoding_answer = np.array([1, 0, 2, 0, 2, 1])
+    fit_na_mask_answer = np.array([0, 0, 0, 0, 0, 1, 0, 1], dtype=np.bool)
+    fit_unk_mask_answer = np.zeros_like(fit_na_mask_answer)
+    tr_na_mask_answer = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
+                                 dtype=np.bool)
+    tr_unk_mask_answer = np.array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
+                                  dtype=np.bool)
+
+    uniques = _nanencode(fit_values, None, False, missing_values, True)
+    nanassert_array_equal(uniques, expected)
+
+    uniques_ = _nanencode(fit_values, uniques, False, missing_values, True)
+    nanassert_array_equal(uniques_, uniques)
+
+    res = _nanencode(fit_values, None, True, missing_values, True)
+    uniques, encoded, na_mask, unk_mask = res
+    nanassert_array_equal(uniques, expected)
+    assert_array_equal(na_mask, fit_na_mask_answer)
+    assert_array_equal(unk_mask, fit_unk_mask_answer)
+    assert_array_equal(encoded[~(na_mask | unk_mask)], encoding_answer)
+
+    res = _nanencode(fit_values, uniques, True, missing_values, True)
+    uniques_, encoded, na_mask, unk_mask = res
+    nanassert_array_equal(uniques_, uniques)
+    assert_array_equal(na_mask, fit_na_mask_answer)
+    assert_array_equal(unk_mask, fit_unk_mask_answer)
+    assert_array_equal(encoded[~(na_mask | unk_mask)], encoding_answer)
+
+    uniques_ = _nanencode(tr_values, uniques, False, missing_values, True)
+    nanassert_array_equal(uniques_, uniques)
+
+    assert_raises(ValueError, _nanencode, tr_values, uniques, True,
+                  missing_values, False)
+
+    res = _nanencode(tr_values, uniques, True, missing_values, True)
+    uniques_, encoded, na_mask, unk_mask = res
+    nanassert_array_equal(uniques_, uniques)
+    assert_array_equal(na_mask, tr_na_mask_answer)
+    assert_array_equal(unk_mask, tr_unk_mask_answer)
+    assert_array_equal(encoded[~(na_mask | unk_mask)], encoding_answer)
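
Taken together, these tests pin down the contract of the _nanencode helper. A
condensed sketch of that contract for reference, assuming the patched private
helper from sklearn/preprocessing/label.py is importable (it is private API,
so this is illustrative only):

    import numpy as np
    from sklearn.preprocessing.label import _nanencode

    values = np.array(['b', 'a', np.nan, 'a'], dtype=object)

    # Fit time: infer sorted categories, excluding the missing marker
    # (np.nan by default)
    uniques = _nanencode(values)
    # array(['a', 'b'], dtype=object)

    # Transform time: integer codes plus a mask of the missing positions
    uniques, encoded, missing_mask = _nanencode(values, uniques, encode=True)
    encoded[~missing_mask]   # array([1, 0, 0])
    missing_mask             # array([False, False,  True, False])

    # With encode_unknown=True, unseen labels are masked instead of raising
    new = np.array(['b', 'z'], dtype=object)
    _, enc2, na2, unk2 = _nanencode(new, uniques, encode=True,
                                    encode_unknown=True)
    unk2                     # array([False,  True])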