From 6ff7800093f510f261743d781c8ee2f7dd234139 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sun, 18 Dec 2016 01:42:49 +0530 Subject: [PATCH 01/49] Initial commit for missing values indicator --- sklearn/preprocessing/__init__.py | 2 + sklearn/preprocessing/imputation.py | 160 +++++++++++++++++++++++++++- 2 files changed, 161 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index ba0884613c124..021abcfaf357d 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -33,6 +33,7 @@ from .label import MultiLabelBinarizer from .imputation import Imputer +from .imputation import MissingIndicator __all__ = [ @@ -42,6 +43,7 @@ 'KernelCenterer', 'LabelBinarizer', 'LabelEncoder', + 'MissingIndicator', 'MultiLabelBinarizer', 'MinMaxScaler', 'MaxAbsScaler', diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 4318122d4be6c..ae308ad6d5701 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -21,7 +21,7 @@ map = six.moves.map __all__ = [ - 'Imputer', + 'Imputer', 'MissingIndicator' ] @@ -371,3 +371,161 @@ def transform(self, X): X[coordinates] = values return X + + +class MissingIndicator(BaseEstimator, TransformerMixin): + """Missing values indicator transformer for indicating missing values. + + Parameters + ---------- + missing_values : integer or "NaN", optional (default="NaN") + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For missing values encoded as np.nan, + use the string value "NaN". + + axis : integer, optional (default=0) + The axis along which to impute. + - If `axis=0`, then impute along columns. + - If `axis=1`, then impute along rows. + + missing_indicators : [None, "all", "train", indices/mask] + If None + If "all" + If "train" + If array + + copy : boolean, optional (default=True) + If True, a copy of X will be created. 
If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + - If X is not an array of floating values; + - If X is sparse and `missing_values=0`; + - If `axis=0` and X is encoded as a CSR matrix; + - If `axis=1` and X is encoded as a CSC matrix. + + sparse : boolean or "auto", optional (default="auto") + If True, the transformed ``X`` will be sparse type. + If False, the transformed ``X`` will be dense type + If "auto", the transformed ``X`` will be os same type as input + + Attributes + ---------- + feat_with_missing_ : array of shape(n_missing_features, ) + The features with missing values. + """ + + def __init__(self, missing_values="NaN", axis=0, + missing_indicators=None, copy=True, sparse="auto"): + self.missing_values = missing_values + self.axis = axis + self.missing_indicators = missing_indicators + self.copy = copy + self.sparse = sparse + + def fit(self, X): + """Fit the transformer on X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + Returns + ------- + self : object + Returns self. 
+ """ + if self.axis not in [0, 1]: + raise ValueError("Can only transform for missing value indicator" + " on axis 0 and 1, got axis={0}" + .format(self.axis)) + + if (isinstance(self.missing_indicators, six.string_types) and + self.missing_indicators not in ["train", "all"]): + raise ValueError("Can only use these optoions: 'train', 'auto' " + " got {0}".format(self.missing_indicators)) + + if self.axis == 0: + X = check_array(X, accept_sparse='csc', dtype=np.float64, + force_all_finite=False) + else: + X = check_array(X, accept_sparse='csr', dtype=np.float64, + force_all_finite=False) + + if self.missing_indicators == "train": + if sparse.issparse(X): + mask = _get_mask(X.data, self.missing_values) + mask_matrix = X.__class__((mask, X.indices.copy(), + X.indptr.copy()), shape=X.shape, + dtype=X.dtype) + if self.axis == 0: + feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] + # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 + self.feat_with_missing_ = np.asarray(feat_with_missing).ravel() + + else: + mask = _get_mask(X, self.missing_values) + if self.axis == 0: + self.feat_with_missing_ = np.where(np.any(mask, axis=0))[0] + + return self + + + def transform(self, X): + """Impute all missing values in X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The input data to complete. + Returns + ------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The transformerwith missing indicator. 
+ + """ + if self.missing_indicators == "train": + check_is_fitted(self, "feat_with_missing_") + + if self.axis == 0: + X = check_array(X, accept_sparse='csc', dtype=np.float64, + force_all_finite=False) + else: + X = check_array(X, accept_sparse='csr', dtype=np.float64, + force_all_finite=False) + + if sparse.issparse(X) and self.missing_values != 0: + # sparse matrix and missing values is not zero + imputer_mask = _get_mask(X.data, self.missing_values) + imputer_mask = X.__class__((imputer_mask, X.indices.copy(), + X.indptr.copy()), shape=X.shape, + dtype=X.dtype) + if self.axis == 0: + feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] + # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 + feat_with_missing = np.asarray(feat_with_missing).ravel() + else: + # sparse with zero as missing value and dense matrix + if sparse.issparse(X): + X = X.toarray() + imputer_mask = _get_mask(X, self.missing_values) + if self.axis == 0: + feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] + + if self.missing_indicators == "train": + features = np.setdiff1d(self.feat_with_missing_, + feat_with_missing) + if features: + warnings.warn("The features %s have missing " + "values in fit but have no missing values" + "in transform " % features, RuntimeWarning, + stacklevel=1) + X = imputer_mask[:, self.feat_with_missing_] + + elif self.missing_indicators == "all": + X = imputer_mask + + elif isinstance(self.missing_indicators, np.ndarray): + X = imputer_mask[:, self.missing_indicators] + + return X + + From 98e28a7c88e3fc7e290b5220bd633d8a1bf8af01 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 20 Dec 2016 23:56:16 +0530 Subject: [PATCH 02/49] Change documentation, remove axis and add simple test --- sklearn/preprocessing/imputation.py | 100 ++++++------------ .../preprocessing/tests/test_imputation.py | 46 ++++++++ 2 files changed, 81 insertions(+), 65 deletions(-) diff --git a/sklearn/preprocessing/imputation.py 
b/sklearn/preprocessing/imputation.py index ae308ad6d5701..2e7a1d9a522a7 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -383,25 +383,13 @@ class MissingIndicator(BaseEstimator, TransformerMixin): `missing_values` will be imputed. For missing values encoded as np.nan, use the string value "NaN". - axis : integer, optional (default=0) - The axis along which to impute. - - If `axis=0`, then impute along columns. - - If `axis=1`, then impute along rows. - - missing_indicators : [None, "all", "train", indices/mask] - If None - If "all" - If "train" - If array - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place whenever possible. Note that, in the following cases, - a new copy will always be made, even if `copy=False`: - - If X is not an array of floating values; - - If X is sparse and `missing_values=0`; - - If `axis=0` and X is encoded as a CSR matrix; - - If `axis=1` and X is encoded as a CSC matrix. + features : [None, "all", "train", array(indices/mask)] + If None, + If "all", mask will represent all features + If "train", mask will only represent features with missing values + during fit time + If mask/indices, mask will only represent features in the + indices or mask sparse : boolean or "auto", optional (default="auto") If True, the transformed ``X`` will be sparse type. @@ -414,12 +402,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): The features with missing values. """ - def __init__(self, missing_values="NaN", axis=0, - missing_indicators=None, copy=True, sparse="auto"): + def __init__(self, missing_values="NaN", features=None, sparse="auto"): self.missing_values = missing_values - self.axis = axis - self.missing_indicators = missing_indicators - self.copy = copy + self.features = features self.sparse = sparse def fit(self, X): @@ -434,38 +419,27 @@ def fit(self, X): self : object Returns self. 
""" - if self.axis not in [0, 1]: - raise ValueError("Can only transform for missing value indicator" - " on axis 0 and 1, got axis={0}" - .format(self.axis)) - - if (isinstance(self.missing_indicators, six.string_types) and - self.missing_indicators not in ["train", "all"]): + if (isinstance(self.features, six.string_types) and + self.features not in ["train", "all"]): raise ValueError("Can only use these optoions: 'train', 'auto' " - " got {0}".format(self.missing_indicators)) + " got {0}".format(self.features)) - if self.axis == 0: - X = check_array(X, accept_sparse='csc', dtype=np.float64, - force_all_finite=False) - else: - X = check_array(X, accept_sparse='csr', dtype=np.float64, - force_all_finite=False) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, + force_all_finite=False) - if self.missing_indicators == "train": + if self.features == "train": if sparse.issparse(X): mask = _get_mask(X.data, self.missing_values) mask_matrix = X.__class__((mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=X.dtype) - if self.axis == 0: - feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] - # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 - self.feat_with_missing_ = np.asarray(feat_with_missing).ravel() + feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] + # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 + self.feat_with_missing_ = np.asarray(feat_with_missing).ravel() else: mask = _get_mask(X, self.missing_values) - if self.axis == 0: - self.feat_with_missing_ = np.where(np.any(mask, axis=0))[0] + self.feat_with_missing_ = np.where(np.any(mask, axis=0))[0] return self @@ -482,15 +456,11 @@ def transform(self, X): The transformerwith missing indicator. 
""" - if self.missing_indicators == "train": + if self.features == "train": check_is_fitted(self, "feat_with_missing_") - if self.axis == 0: - X = check_array(X, accept_sparse='csc', dtype=np.float64, - force_all_finite=False) - else: - X = check_array(X, accept_sparse='csr', dtype=np.float64, - force_all_finite=False) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, + force_all_finite=False) if sparse.issparse(X) and self.missing_values != 0: # sparse matrix and missing values is not zero @@ -498,19 +468,19 @@ def transform(self, X): imputer_mask = X.__class__((imputer_mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=X.dtype) - if self.axis == 0: - feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] - # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 - feat_with_missing = np.asarray(feat_with_missing).ravel() + + feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] + # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 + feat_with_missing = np.asarray(feat_with_missing).ravel() + else: # sparse with zero as missing value and dense matrix if sparse.issparse(X): X = X.toarray() imputer_mask = _get_mask(X, self.missing_values) - if self.axis == 0: - feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] + feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] - if self.missing_indicators == "train": + if self.features == "train": features = np.setdiff1d(self.feat_with_missing_, feat_with_missing) if features: @@ -518,14 +488,14 @@ def transform(self, X): "values in fit but have no missing values" "in transform " % features, RuntimeWarning, stacklevel=1) - X = imputer_mask[:, self.feat_with_missing_] + imputer_mask = imputer_mask[:, self.feat_with_missing_] - elif self.missing_indicators == "all": - X = imputer_mask + elif self.features == "all": + imputer_mask = imputer_mask - elif isinstance(self.missing_indicators, np.ndarray): - X = imputer_mask[:, self.missing_indicators] + 
elif isinstance(self.features, (np.ndarray, list, tuple)): + imputer_mask = imputer_mask[:, self.features] - return X + return imputer_mask diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 663262b50289b..89c57d9346cd4 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -2,6 +2,8 @@ import numpy as np from scipy import sparse +from sklearn.base import clone + from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -10,6 +12,8 @@ from sklearn.utils.testing import ignore_warnings from sklearn.preprocessing.imputation import Imputer +from sklearn.preprocessing.imputation import MissingIndicator + from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -374,3 +378,45 @@ def test_imputation_copy(): # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is # made, even if copy=False. 
+ + +def test_missing_indicator(): + X1 = np.array([ + [-1, -1, 1, 3], + [4, -1, 0, -1], + [8, -1, 1, 0], + [0, -1, 0, 15], + [16, -1, 1, 19] + ]) + X2 = np.array([ + [5, 1, -1, -1], + [-1, -1, 2, 3], + [2, 3, 4, 0], + [0, -1, 5, -1], + [11, -1, 1, 1] + ]) + + # features = "all": + MI = MissingIndicator(missing_values = -1) + MI.fit(X1) + X2_tr = MI.transform(X2) + mask = X2 == -1 + assert_array_equal(X2_tr, mask) + + # features = "train" + MI = clone(MI).set_params(features = "train") + MI.fit(X1) + X2_tr = MI.transform(X2) + features = MI.feat_with_missing_ + mask = X2[:, features] == -1 + assert_array_equal(X2_tr, mask) + + # features = [1, 2] + features = [1, 2] + MI = clone(MI).set_params(features = features) + MI.fit(X1) + X2_tr = MI.transform(X2) + mask = X2[:, features] == -1 + assert_array_equal(X2_tr, mask) + + From 38c58e2e26a927b7da636b3f3625c6a42ffa693d Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 22 Dec 2016 11:10:24 +0530 Subject: [PATCH 03/49] Add documentation and tests --- doc/modules/preprocessing.rst | 33 ++++++ sklearn/preprocessing/imputation.py | 105 +++++++++++------- .../preprocessing/tests/test_imputation.py | 94 ++++++++++------ 3 files changed, 157 insertions(+), 75 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 19bdfc0d432a0..86abb4944bb39 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -596,6 +596,39 @@ Imputation of missing values Tools for imputing missing values are discussed at :ref:`impute`. +.. _missing_indicator: + +Transformer indicating missing values +===================================== + +MissingIndicator transformer is useful to transform a dataset into corresponding +binary matrix indicating the presence of missing values in the dataset. +The knowledge of which features were imputed can be exploited by a downstream +estimator by adding features that indicate which elements have been imputed. 
+ + >>> from sklearn.preprocessing import MissingIndicator + >>> import numpy as np + >>> a = MissingIndicator(missing_values = -1) + >>> X1 = np.array([ + ... [-1, -1, 1, 3], + ... [ 4, -1, 0, -1], + ... [ 8, -1, 1, 0], + ... ]) + >>> X2 = np.array([ + ... [ 5, 1, -1, -1], + ... [-1, -1, 2, 3], + ... [ 2, 3, 4, 0], + ... ]) + >>> MI = MissingIndicator(missing_values = -1) + >>> MI.fit(X1) + MissingIndicator(features='train', missing_values=-1, sparse='auto') + >>> X2_tr = MI.transform(X2) + >>> X2_tr + array([[False, False, True], + [ True, True, False], + [False, False, False]], dtype=bool) + + .. _polynomial_features: Generating polynomial features diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 2e7a1d9a522a7..ab081b9024cbd 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -383,8 +383,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): `missing_values` will be imputed. For missing values encoded as np.nan, use the string value "NaN". - features : [None, "all", "train", array(indices/mask)] - If None, + features : {'train' (default), 'all', array-like of int} If "all", mask will represent all features If "train", mask will only represent features with missing values during fit time @@ -399,10 +398,36 @@ class MissingIndicator(BaseEstimator, TransformerMixin): Attributes ---------- feat_with_missing_ : array of shape(n_missing_features, ) - The features with missing values. + The features with missing values. + Note that this is only stored if features == 'train + + Example + ------- + >>> from sklearn.preprocessing import MissingIndicator + >>> import numpy as np + >>> a = MissingIndicator(missing_values = -1) + >>> X1 = np.array([ + ... [-1, 1, 3], + ... [ 4, 0, -1], + ... [ 8, 1, 0] + ... ]) + >>> X2 = np.array([ + ... [ 5, -1, -1], + ... [-1, 2, 3], + ... [ 2, 4, 0] + ... 
]) + >>> MI = MissingIndicator(missing_values = -1) + >>> MI.fit(X1) + MissingIndicator(features='train', missing_values=-1, sparse='auto') + >>> X2_tr = MI.transform(X2) + >>> X2_tr + array([[False, True], + [ True, False], + [False, False]], dtype=bool) + """ - def __init__(self, missing_values="NaN", features=None, sparse="auto"): + def __init__(self, missing_values="NaN", features="train", sparse="auto"): self.missing_values = missing_values self.features = features self.sparse = sparse @@ -421,29 +446,25 @@ def fit(self, X): """ if (isinstance(self.features, six.string_types) and self.features not in ["train", "all"]): - raise ValueError("Can only use these optoions: 'train', 'auto' " + raise ValueError("Can only use these options: 'train', 'all'" " got {0}".format(self.features)) + elif (isinstance(self.features, np.ndarray) and + not np.issubdtype(self.features.dtype, np.integer)): + raise ValueError("Features should be an array of integers") + + if not (isinstance(self.sparse, (six.string_types, bool)) and + self.sparse == "auto"): + raise ValueError("sparse can only use be boolean or 'auto'" + " got {0}".format(self.sparse)) X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) if self.features == "train": - if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) - mask_matrix = X.__class__((mask, X.indices.copy(), - X.indptr.copy()), shape=X.shape, - dtype=X.dtype) - feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] - # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 - self.feat_with_missing_ = np.asarray(feat_with_missing).ravel() - - else: - mask = _get_mask(X, self.missing_values) - self.feat_with_missing_ = np.where(np.any(mask, axis=0))[0] + _, self.feat_with_missing_ = self._get_missing_features_info(X) return self - def transform(self, X): """Impute all missing values in X. 
Parameters @@ -462,40 +483,42 @@ def transform(self, X): X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) - if sparse.issparse(X) and self.missing_values != 0: - # sparse matrix and missing values is not zero - imputer_mask = _get_mask(X.data, self.missing_values) - imputer_mask = X.__class__((imputer_mask, X.indices.copy(), - X.indptr.copy()), shape=X.shape, - dtype=X.dtype) - - feat_with_missing = mask_matrix.sum(axis=0).nonzero()[1] - # ravel since nonzero returns 2d matrices for sparse in scipy 0.11 - feat_with_missing = np.asarray(feat_with_missing).ravel() - - else: - # sparse with zero as missing value and dense matrix - if sparse.issparse(X): - X = X.toarray() - imputer_mask = _get_mask(X, self.missing_values) - feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] + imputer_mask, feat_with_missing = self._get_missing_features_info(X) if self.features == "train": - features = np.setdiff1d(self.feat_with_missing_, - feat_with_missing) - if features: + features = np.setdiff1d(feat_with_missing, + self.feat_with_missing_) + if features.size: warnings.warn("The features %s have missing " - "values in fit but have no missing values" - "in transform " % features, RuntimeWarning, + "values in transform but have no missing values" + " in fit " % features, RuntimeWarning, stacklevel=1) imputer_mask = imputer_mask[:, self.feat_with_missing_] elif self.features == "all": imputer_mask = imputer_mask - elif isinstance(self.features, (np.ndarray, list, tuple)): + else: # features is array-like imputer_mask = imputer_mask[:, self.features] return imputer_mask + def _get_missing_features_info(self, X): + if sparse.issparse(X) and self.missing_values != 0: + # sparse matrix and missing values is not zero + imputer_mask = _get_mask(X.data, self.missing_values) + imputer_mask = X.__class__((imputer_mask, X.indices.copy(), + X.indptr.copy()), shape=X.shape, + dtype=X.dtype) + + feat_with_missing = 
imputer_mask.sum(axis=0).nonzero()[1] + feat_with_missing = np.ravel(feat_with_missing) + + else: + # sparse with zero as missing value and dense matrix + if sparse.issparse(X): + X = X.toarray() + imputer_mask = _get_mask(X, self.missing_values) + feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] + return imputer_mask, feat_with_missing diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 89c57d9346cd4..196006e35593b 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns_message from sklearn.preprocessing.imputation import Imputer from sklearn.preprocessing.imputation import MissingIndicator @@ -381,42 +382,67 @@ def test_imputation_copy(): def test_missing_indicator(): + X1_orig = np.array([ + [-1, -1, 1, 3], + [4, -1, 0, -1], + [8, -1, 1, 0], + [0, -1, 0, 15], + [16, -1, 1, 19] + ]) + X2_orig = np.array([ + [5, 1, 1, -1], + [-1, -1, 2, 3], + [2, 3, 4, 0], + [0, -1, 5, -1], + [11, -1, 1, 1] + ]) + + for X1, X2, missing_values in [(X1_orig, X2_orig, -1), + (X1_orig + 1, X2_orig + 1, 0)]: + mask = X2 == missing_values + expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0] + for retype in [np.array, sparse.csr_matrix, + sparse.csc_matrix, sparse.lil_matrix]: + # features = "train": + MI = MissingIndicator(missing_values=missing_values) + MI.fit(retype(X1)) + X2_tr = MI.transform(X2) + features = MI.feat_with_missing_ + assert_array_equal(expect_feat_missing, features) + assert_array_equal(np.asarray(X2_tr), mask[:, features]) + + # features = "all" + MI = clone(MI).set_params(features="all") + MI.fit(retype(X1)) + X2_tr = MI.transform(X2) + features = np.arange(X2.shape[1]) + 
assert_array_equal(np.asarray(X2_tr), mask[:, features]) + + # features = [1, 2] + features = [1, 2] + MI = clone(MI).set_params(features=features) + MI.fit(retype(X1)) + X2_tr = MI.transform(X2) + assert_array_equal(np.asarray(X2_tr), mask[:, features]) + + +def test_missing_indicator_warning(): X1 = np.array([ - [-1, -1, 1, 3], - [4, -1, 0, -1], - [8, -1, 1, 0], - [0, -1, 0, 15], - [16, -1, 1, 19] + [-1, 1, 3], + [4, 0, -1], + [8, 1, 0] ]) X2 = np.array([ - [5, 1, -1, -1], - [-1, -1, 2, 3], - [2, 3, 4, 0], - [0, -1, 5, -1], - [11, -1, 1, 1] + [5, -1, -1], + [-1, 2, 3], + [2, 4, 0] ]) - - # features = "all": - MI = MissingIndicator(missing_values = -1) + MI = MissingIndicator(missing_values=-1) MI.fit(X1) - X2_tr = MI.transform(X2) - mask = X2 == -1 - assert_array_equal(X2_tr, mask) - - # features = "train" - MI = clone(MI).set_params(features = "train") - MI.fit(X1) - X2_tr = MI.transform(X2) - features = MI.feat_with_missing_ - mask = X2[:, features] == -1 - assert_array_equal(X2_tr, mask) - - # features = [1, 2] - features = [1, 2] - MI = clone(MI).set_params(features = features) - MI.fit(X1) - X2_tr = MI.transform(X2) - mask = X2[:, features] == -1 - assert_array_equal(X2_tr, mask) - - + missing_features_fit = np.where(np.any(X1 == -1, axis=0))[0] + missing_features_tr = np.where(np.any(X2 == -1, axis=0))[0] + extra_missing_features = np.setdiff1d(missing_features_tr, + missing_features_fit) + warn_msg = "The features %s have missing values in transform " \ + "but have no missing values in fit" % extra_missing_features + assert_warns_message(RuntimeWarning, warn_msg, MI.transform, X2) From 781d07dbd3f784213080cdcb44ec21656c2e9bff Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Wed, 28 Dec 2016 11:15:30 +0530 Subject: [PATCH 04/49] Add sparse option functionality --- sklearn/preprocessing/imputation.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/imputation.py 
b/sklearn/preprocessing/imputation.py index ab081b9024cbd..ca9b698520bd1 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -452,8 +452,8 @@ def fit(self, X): not np.issubdtype(self.features.dtype, np.integer)): raise ValueError("Features should be an array of integers") - if not (isinstance(self.sparse, (six.string_types, bool)) and - self.sparse == "auto"): + if not ((isinstance(self.sparse, six.string_types) and + self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("sparse can only use be boolean or 'auto'" " got {0}".format(self.sparse)) @@ -504,13 +504,14 @@ def transform(self, X): return imputer_mask def _get_missing_features_info(self, X): + print 'heredffdasf ' + str(type(X)) if sparse.issparse(X) and self.missing_values != 0: # sparse matrix and missing values is not zero imputer_mask = _get_mask(X.data, self.missing_values) imputer_mask = X.__class__((imputer_mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=X.dtype) - + print 'here' + str(type(X)) + str(type(imputer_mask)) feat_with_missing = imputer_mask.sum(axis=0).nonzero()[1] feat_with_missing = np.ravel(feat_with_missing) @@ -521,4 +522,12 @@ def _get_missing_features_info(self, X): imputer_mask = _get_mask(X, self.missing_values) feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] + if self.sparse is True: + if sparse.issparse(imputer_mask): + imputer_mask = imputer_mask.tocsc() + else: + imputer_mask = sparse.csc_matrix(imputer_mask) + elif self.sparse is False and sparse.issparse(imputer_mask): + imputer_mask = imputer_mask.toarray() + return imputer_mask, feat_with_missing From ec6d69ab62d19f9f907c3825b0372e94dacb5126 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 3 Jan 2017 08:25:38 +0530 Subject: [PATCH 05/49] Modify tests --- sklearn/preprocessing/imputation.py | 2 - .../preprocessing/tests/test_imputation.py | 64 +++++++++++++------ 2 files changed, 43 insertions(+), 23 deletions(-) diff 
--git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index ca9b698520bd1..836cb9be9cd07 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -504,14 +504,12 @@ def transform(self, X): return imputer_mask def _get_missing_features_info(self, X): - print 'heredffdasf ' + str(type(X)) if sparse.issparse(X) and self.missing_values != 0: # sparse matrix and missing values is not zero imputer_mask = _get_mask(X.data, self.missing_values) imputer_mask = X.__class__((imputer_mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=X.dtype) - print 'here' + str(type(X)) + str(type(imputer_mask)) feat_with_missing = imputer_mask.sum(axis=0).nonzero()[1] feat_with_missing = np.ravel(feat_with_missing) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 196006e35593b..81f1bf0140848 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -397,33 +397,55 @@ def test_missing_indicator(): [11, -1, 1, 1] ]) + def assert_type(actual, expect, sp, missing_values): + if sp is True and missing_values != 0: + assert_equal(actual, sparse.csc_matrix) + elif (sp is True and missing_values == 0) or \ + sp is False: + assert_equal(actual, np.ndarray) + else: + print type(retype(X2)), sp, missing_values, type(X2_tr) + assert_equal(actual, expect) + + def assert_mask(actual, expected, features): + if hasattr(actual, 'toarray'): + assert_array_equal(actual.toarray(), expected[:, features]) + else: + assert_array_equal(actual, expected[:, features]) + for X1, X2, missing_values in [(X1_orig, X2_orig, -1), (X1_orig + 1, X2_orig + 1, 0)]: mask = X2 == missing_values expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0] for retype in [np.array, sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix]: - # features = "train": - MI = MissingIndicator(missing_values=missing_values) - 
MI.fit(retype(X1)) - X2_tr = MI.transform(X2) - features = MI.feat_with_missing_ - assert_array_equal(expect_feat_missing, features) - assert_array_equal(np.asarray(X2_tr), mask[:, features]) - - # features = "all" - MI = clone(MI).set_params(features="all") - MI.fit(retype(X1)) - X2_tr = MI.transform(X2) - features = np.arange(X2.shape[1]) - assert_array_equal(np.asarray(X2_tr), mask[:, features]) - - # features = [1, 2] - features = [1, 2] - MI = clone(MI).set_params(features=features) - MI.fit(retype(X1)) - X2_tr = MI.transform(X2) - assert_array_equal(np.asarray(X2_tr), mask[:, features]) + for sp in [True, False, 'auto']: + X1_ft = retype(X1) + X2_t = retype(X2) + # features = "train": + MI = MissingIndicator(missing_values=missing_values, + sparse = sp) + + MI.fit(X1_ft) + X2_tr = MI.transform(X2_t) + features = MI.feat_with_missing_ + assert_array_equal(expect_feat_missing, features) + assert_type(type(X2_tr), type(X2_t), sp, missing_values) + assert_mask(X2_tr, mask, features) + + # features = "all" + MI = clone(MI).set_params(features="all") + MI.fit(X1_ft) + X2_tr = MI.transform(retype(X2)) + features = np.arange(X2.shape[1]) + assert_mask(X2_tr, mask, features) + + # features = [1, 2] + features = [1, 2] + MI = clone(MI).set_params(features=features) + MI.fit(X1_ft) + X2_tr = MI.transform(X2_t) + assert_mask(X2_tr, mask, features) def test_missing_indicator_warning(): From 605d189589ae2077a47d6a2ac1756e26daf1f73e Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 2 Feb 2017 13:30:55 +0530 Subject: [PATCH 06/49] Add comprehensive tests --- doc/modules/preprocessing.rst | 8 +- sklearn/preprocessing/imputation.py | 27 ++++--- .../preprocessing/tests/test_imputation.py | 76 ++++++++++--------- 3 files changed, 62 insertions(+), 49 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 86abb4944bb39..644aeb9225b45 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -601,7 +601,7 @@ 
Tools for imputing missing values are discussed at :ref:`impute`. Transformer indicating missing values ===================================== -MissingIndicator transformer is useful to transform a dataset into corresponding +:class:`MissingIndicator` transformer is useful to transform a dataset into corresponding binary matrix indicating the presence of missing values in the dataset. The knowledge of which features were imputed can be exploited by a downstream estimator by adding features that indicate which elements have been imputed. @@ -624,9 +624,9 @@ estimator by adding features that indicate which elements have been imputed. MissingIndicator(features='train', missing_values=-1, sparse='auto') >>> X2_tr = MI.transform(X2) >>> X2_tr - array([[False, False, True], - [ True, True, False], - [False, False, False]], dtype=bool) + array([[0, 0, 1], + [1, 1, 0], + [0, 0, 0]], dtype=int32) .. _polynomial_features: diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 836cb9be9cd07..3da2f230a4e3b 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -421,9 +421,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): MissingIndicator(features='train', missing_values=-1, sparse='auto') >>> X2_tr = MI.transform(X2) >>> X2_tr - array([[False, True], - [ True, False], - [False, False]], dtype=bool) + array([[0, 1], + [1, 0], + [0, 0]], dtype=int32) """ @@ -434,11 +434,13 @@ def __init__(self, missing_values="NaN", features="train", sparse="auto"): def fit(self, X): """Fit the transformer on X. + Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. + Returns ------- self : object @@ -466,15 +468,17 @@ def fit(self, X): return self def transform(self, X): - """Impute all missing values in X. + """Generate missing values indicator for X. 
+ Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. + Returns ------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - The transformerwith missing indicator. + Xt : {array-like, sparse matrix}, shape = [n_samples, n_features] + The missing indicator for input data """ if self.features == "train": @@ -482,16 +486,15 @@ def transform(self, X): X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) - imputer_mask, feat_with_missing = self._get_missing_features_info(X) if self.features == "train": features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) if features.size: - warnings.warn("The features %s have missing " - "values in transform but have no missing values" - " in fit " % features, RuntimeWarning, + warnings.warn("The features %s have missing values " + "in transform but have no missing values " + "in fit " % features, RuntimeWarning, stacklevel=1) imputer_mask = imputer_mask[:, self.feat_with_missing_] @@ -518,6 +521,7 @@ def _get_missing_features_info(self, X): if sparse.issparse(X): X = X.toarray() imputer_mask = _get_mask(X, self.missing_values) + imputer_mask = imputer_mask.astype(np.int32, copy=False) feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] if self.sparse is True: @@ -527,5 +531,8 @@ def _get_missing_features_info(self, X): imputer_mask = sparse.csc_matrix(imputer_mask) elif self.sparse is False and sparse.issparse(imputer_mask): imputer_mask = imputer_mask.toarray() + elif self.sparse == 'auto' and self.missing_values != 0: + if sparse.issparse(imputer_mask): + imputer_mask = imputer_mask.tocsc() return imputer_mask, feat_with_missing diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 81f1bf0140848..ba2b9830f357d 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -397,15 
+397,17 @@ def test_missing_indicator(): [11, -1, 1, 1] ]) - def assert_type(actual, expect, sp, missing_values): - if sp is True and missing_values != 0: + def assert_type(actual, is_sparse, sp, missing_values): + if sp is True : assert_equal(actual, sparse.csc_matrix) - elif (sp is True and missing_values == 0) or \ - sp is False: + elif (sp is "auto" and missing_values == 0 ) \ + or sp is False: assert_equal(actual, np.ndarray) else: - print type(retype(X2)), sp, missing_values, type(X2_tr) - assert_equal(actual, expect) + if is_sparse: + assert_equal(actual, sparse.csc_matrix) + else: + assert_equal(actual, np.ndarray) def assert_mask(actual, expected, features): if hasattr(actual, 'toarray'): @@ -413,39 +415,43 @@ def assert_mask(actual, expected, features): else: assert_array_equal(actual, expected[:, features]) - for X1, X2, missing_values in [(X1_orig, X2_orig, -1), - (X1_orig + 1, X2_orig + 1, 0)]: + def _check_missing_indicator(X1, X2, retype, sp, missing_values): mask = X2 == missing_values expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0] - for retype in [np.array, sparse.csr_matrix, + + X1_in = retype(X1) + X2_in = retype(X2) + # features = "train": + MI = MissingIndicator(missing_values=missing_values, + sparse = sp) + + MI.fit(X1_in) + X2_tr = MI.transform(X2_in) + features = MI.feat_with_missing_ + assert_array_equal(expect_feat_missing, features) + assert_type(type(X2_tr),sparse.issparse(X2_in), sp, missing_values) + assert_mask(X2_tr, mask, features) + + # features = "all" + MI = clone(MI).set_params(features="all") + MI.fit(X1_in) + X2_tr = MI.transform(X2_in) + features = np.arange(X2.shape[1]) + assert_mask(X2_tr, mask, features) + + # features = [1, 2] + features = [1, 2] + MI = clone(MI).set_params(features=features) + MI.fit(X1_in) + X2_tr = MI.transform(X2_in) + assert_mask(X2_tr, mask, features) + + for X1, X2, missing_values in [(X1_orig, X2_orig, -1), + (X1_orig + 1, X2_orig + 1, 0)]: + for retype in [lambda x: 
x.tolist(), np.array, sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix]: for sp in [True, False, 'auto']: - X1_ft = retype(X1) - X2_t = retype(X2) - # features = "train": - MI = MissingIndicator(missing_values=missing_values, - sparse = sp) - - MI.fit(X1_ft) - X2_tr = MI.transform(X2_t) - features = MI.feat_with_missing_ - assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr), type(X2_t), sp, missing_values) - assert_mask(X2_tr, mask, features) - - # features = "all" - MI = clone(MI).set_params(features="all") - MI.fit(X1_ft) - X2_tr = MI.transform(retype(X2)) - features = np.arange(X2.shape[1]) - assert_mask(X2_tr, mask, features) - - # features = [1, 2] - features = [1, 2] - MI = clone(MI).set_params(features=features) - MI.fit(X1_ft) - X2_tr = MI.transform(X2_t) - assert_mask(X2_tr, mask, features) + _check_missing_indicator(X1, X2, retype, sp, missing_values) def test_missing_indicator_warning(): From ca8af65aa6d3545796b1b80fbd5279c321dc0555 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sun, 19 Feb 2017 00:47:58 +0530 Subject: [PATCH 07/49] Common tests --- sklearn/preprocessing/imputation.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 3da2f230a4e3b..5b92f9bfc7e5e 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -401,6 +401,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): The features with missing values. Note that this is only stored if features == 'train + n_features_ : int + The number of features during fit time. + Example ------- >>> from sklearn.preprocessing import MissingIndicator @@ -432,7 +435,7 @@ def __init__(self, missing_values="NaN", features="train", sparse="auto"): self.features = features self.sparse = sparse - def fit(self, X): + def fit(self, X, y=None): """Fit the transformer on X. 
Parameters @@ -459,8 +462,8 @@ def fit(self, X): raise ValueError("sparse can only use be boolean or 'auto'" " got {0}".format(self.sparse)) - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, - force_all_finite=False) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) + self.n_features_ = X.shape[1] if self.features == "train": _, self.feat_with_missing_ = self._get_missing_features_info(X) @@ -484,14 +487,16 @@ def transform(self, X): if self.features == "train": check_is_fitted(self, "feat_with_missing_") - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, - force_all_finite=False) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) + if X.shape[1] != self.n_features_: + raise ValueError("X has a different shape than during fitting.") + imputer_mask, feat_with_missing = self._get_missing_features_info(X) if self.features == "train": features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) - if features.size: + if features.size > 0: warnings.warn("The features %s have missing values " "in transform but have no missing values " "in fit " % features, RuntimeWarning, From 07c0fce9488bb167fbbb692e18efb2219912f45e Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sat, 4 Mar 2017 18:43:30 +0530 Subject: [PATCH 08/49] fix astype usage --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/imputation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 644aeb9225b45..e155f49699d72 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -626,7 +626,7 @@ estimator by adding features that indicate which elements have been imputed. >>> X2_tr array([[0, 0, 1], [1, 1, 0], - [0, 0, 0]], dtype=int32) + [0, 0, 0]]) .. 
_polynomial_features: diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 5b92f9bfc7e5e..bfe276c8e5028 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -426,7 +426,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): >>> X2_tr array([[0, 1], [1, 0], - [0, 0]], dtype=int32) + [0, 0]]) """ @@ -526,7 +526,7 @@ def _get_missing_features_info(self, X): if sparse.issparse(X): X = X.toarray() imputer_mask = _get_mask(X, self.missing_values) - imputer_mask = imputer_mask.astype(np.int32, copy=False) + imputer_mask = astype(imputer_mask, int, copy=False) feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] if self.sparse is True: From f02d78aa8ec48f69c393aa31eb979b68c6f06025 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sat, 4 Mar 2017 22:50:17 +0530 Subject: [PATCH 09/49] pep fixes --- sklearn/preprocessing/imputation.py | 2 +- sklearn/preprocessing/tests/test_imputation.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index bfe276c8e5028..d637e3a93c2ac 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -450,7 +450,7 @@ def fit(self, X, y=None): Returns self. 
""" if (isinstance(self.features, six.string_types) and - self.features not in ["train", "all"]): + self.features not in ["train", "all"]): raise ValueError("Can only use these options: 'train', 'all'" " got {0}".format(self.features)) elif (isinstance(self.features, np.ndarray) and diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index ba2b9830f357d..b80cc0204f464 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -398,17 +398,16 @@ def test_missing_indicator(): ]) def assert_type(actual, is_sparse, sp, missing_values): - if sp is True : + if sp is True: assert_equal(actual, sparse.csc_matrix) - elif (sp is "auto" and missing_values == 0 ) \ - or sp is False: + elif ((sp is "auto" and missing_values == 0) or sp is False): assert_equal(actual, np.ndarray) else: if is_sparse: assert_equal(actual, sparse.csc_matrix) else: assert_equal(actual, np.ndarray) - + def assert_mask(actual, expected, features): if hasattr(actual, 'toarray'): assert_array_equal(actual.toarray(), expected[:, features]) @@ -423,13 +422,13 @@ def _check_missing_indicator(X1, X2, retype, sp, missing_values): X2_in = retype(X2) # features = "train": MI = MissingIndicator(missing_values=missing_values, - sparse = sp) + sparse=sp) MI.fit(X1_in) X2_tr = MI.transform(X2_in) features = MI.feat_with_missing_ assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr),sparse.issparse(X2_in), sp, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) assert_mask(X2_tr, mask, features) # features = "all" From 2379edb1331ab2276073077412d61f3518923168 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Mon, 6 Mar 2017 23:22:43 +0530 Subject: [PATCH 10/49] Implement fit_transform --- doc/modules/preprocessing.rst | 28 ++-- sklearn/preprocessing/imputation.py | 146 +++++++++++------- .../preprocessing/tests/test_imputation.py | 60 ++++--- 
3 files changed, 145 insertions(+), 89 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e155f49699d72..605bbb76756af 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -598,7 +598,7 @@ Tools for imputing missing values are discussed at :ref:`impute`. .. _missing_indicator: -Transformer indicating missing values +Imputation of missing values ===================================== :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding @@ -606,9 +606,12 @@ binary matrix indicating the presence of missing values in the dataset. The knowledge of which features were imputed can be exploited by a downstream estimator by adding features that indicate which elements have been imputed. +The ``features`` attribute is used to choose the features for which the mask is constructed. +By default, the binary matrix has only features with at least one missing value. +In case it mentioned as *all* the matrix has all the features in the input + >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np - >>> a = MissingIndicator(missing_values = -1) >>> X1 = np.array([ ... [-1, -1, 1, 3], ... [ 4, -1, 0, -1], @@ -619,15 +622,18 @@ estimator by adding features that indicate which elements have been imputed. ... [-1, -1, 2, 3], ... [ 2, 3, 4, 0], ... ]) - >>> MI = MissingIndicator(missing_values = -1) - >>> MI.fit(X1) - MissingIndicator(features='train', missing_values=-1, sparse='auto') - >>> X2_tr = MI.transform(X2) - >>> X2_tr - array([[0, 0, 1], - [1, 1, 0], - [0, 0, 0]]) - + >>> indicator = MissingIndicator(missing_values=-1) + >>> X1_tr = indicator.fit_transform(X1) + >>> X1_tr + array([[1, 1, 0], + [0, 1, 1], + [0, 1, 0]]) + >>> indicator = MissingIndicator(missing_values=-1, features="all") + >>> X1_tr = indicator.fit_transform(X1) + >>> X1_tr + array([[1, 1, 0, 0], + [0, 1, 0, 1], + [0, 1, 0, 0]]) .. 
_polynomial_features: diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index d637e3a93c2ac..cafcaf120344c 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -384,31 +384,21 @@ class MissingIndicator(BaseEstimator, TransformerMixin): use the string value "NaN". features : {'train' (default), 'all', array-like of int} - If "all", mask will represent all features + If "all", mask will represent all features. If "train", mask will only represent features with missing values - during fit time + during fit time. If mask/indices, mask will only represent features in the - indices or mask + indices or mask. sparse : boolean or "auto", optional (default="auto") - If True, the transformed ``X`` will be sparse type. - If False, the transformed ``X`` will be dense type - If "auto", the transformed ``X`` will be os same type as input - - Attributes - ---------- - feat_with_missing_ : array of shape(n_missing_features, ) - The features with missing values. - Note that this is only stored if features == 'train - - n_features_ : int - The number of features during fit time. + If True, the transformed ``X`` will be a sparse matrix. + If False, the transformed ``X`` will be a numpy array. + If "auto", the transformed ``X`` will be of same type as input. Example ------- >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np - >>> a = MissingIndicator(missing_values = -1) >>> X1 = np.array([ ... [-1, 1, 3], ... [ 4, 0, -1], @@ -419,15 +409,23 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [-1, 2, 3], ... [ 2, 4, 0] ... 
]) - >>> MI = MissingIndicator(missing_values = -1) - >>> MI.fit(X1) + >>> indicator = MissingIndicator(missing_values=-1) + >>> indicator.fit(X1) MissingIndicator(features='train', missing_values=-1, sparse='auto') - >>> X2_tr = MI.transform(X2) + >>> X2_tr = indicator.transform(X2) >>> X2_tr array([[0, 1], [1, 0], [0, 0]]) + Attributes + ---------- + feat_with_missing_ : array of shape(n_missing_features, ) + The features with missing values. + Note that this is only stored if features == 'train + + n_features_ : int + The number of features during fit time. """ def __init__(self, missing_values="NaN", features="train", sparse="auto"): @@ -449,23 +447,13 @@ def fit(self, X, y=None): self : object Returns self. """ - if (isinstance(self.features, six.string_types) and - self.features not in ["train", "all"]): - raise ValueError("Can only use these options: 'train', 'all'" - " got {0}".format(self.features)) - elif (isinstance(self.features, np.ndarray) and - not np.issubdtype(self.features.dtype, np.integer)): - raise ValueError("Features should be an array of integers") - - if not ((isinstance(self.sparse, six.string_types) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("sparse can only use be boolean or 'auto'" - " got {0}".format(self.sparse)) + self._validate_params() X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) self.n_features_ = X.shape[1] - if self.features == "train": + if (isinstance(self.features, six.string_types) and + self.features == "train"): _, self.feat_with_missing_ = self._get_missing_features_info(X) return self @@ -480,37 +468,85 @@ def transform(self, X): Returns ------- - Xt : {array-like, sparse matrix}, shape = [n_samples, n_features] + Xt : array or sparse matrix, shape = [n_samples, n_features] The missing indicator for input data """ - if self.features == "train": - check_is_fitted(self, "feat_with_missing_") + if (isinstance(self.features, six.string_types) and + self.features 
== "train"): + check_is_fitted(self, "feat_with_missing_", "n_features_") + else: + check_is_fitted(self, 'n_features_') X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) if X.shape[1] != self.n_features_: - raise ValueError("X has a different shape than during fitting.") + raise ValueError("X has a different number of features " + "than during fitting.") imputer_mask, feat_with_missing = self._get_missing_features_info(X) - if self.features == "train": - features = np.setdiff1d(feat_with_missing, - self.feat_with_missing_) - if features.size > 0: - warnings.warn("The features %s have missing values " - "in transform but have no missing values " - "in fit " % features, RuntimeWarning, - stacklevel=1) - imputer_mask = imputer_mask[:, self.feat_with_missing_] + if isinstance(self.features, six.string_types): + if self.features == "train": + features = np.setdiff1d(feat_with_missing, + self.feat_with_missing_) + if features.size > 0: + warnings.warn("The features %s have missing values " + "in transform but have no missing values " + "in fit " % features, RuntimeWarning, + stacklevel=1) + imputer_mask = imputer_mask[:, self.feat_with_missing_] - elif self.features == "all": - imputer_mask = imputer_mask + else: # features is array-like + imputer_mask = imputer_mask[:, self.features] + + return imputer_mask + + def fit_transform(self, X): + """Generate missing values indicator for X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The input data to complete. 
+ + Returns + ------- + Xt : array or sparse matrix, shape = [n_samples, n_features] + The missing indicator for input data + + """ + self._validate_params() + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) + + imputer_mask, feat_with_missing = self._get_missing_features_info(X) + + self.n_features_ = X.shape[1] + if isinstance(self.features, six.string_types): + if self.features == "train": + self.feat_with_missing_ = feat_with_missing + imputer_mask = imputer_mask[:, self.feat_with_missing_] else: # features is array-like imputer_mask = imputer_mask[:, self.features] return imputer_mask + def _validate_params(self): + if (isinstance(self.features, six.string_types) and + self.features not in ["train", "all"]): + raise ValueError("Can only use these options: 'train', 'all'" + " got {0}".format(self.features)) + elif not isinstance(self.features, six.string_types): + self.features = check_array(self.features, ensure_2d=False) + if (isinstance(self.features, np.ndarray) and + self.features.dtype.kind != 'i'): + raise ValueError("Features should be an array of integers") + + if not ((isinstance(self.sparse, six.string_types) and + self.sparse == "auto") or isinstance(self.sparse, bool)): + raise ValueError("sparse can only boolean or 'auto'" + " got {0}".format(self.sparse)) + def _get_missing_features_info(self, X): if sparse.issparse(X) and self.missing_values != 0: # sparse matrix and missing values is not zero @@ -519,25 +555,19 @@ def _get_missing_features_info(self, X): X.indptr.copy()), shape=X.shape, dtype=X.dtype) feat_with_missing = imputer_mask.sum(axis=0).nonzero()[1] - feat_with_missing = np.ravel(feat_with_missing) - else: # sparse with zero as missing value and dense matrix if sparse.issparse(X): X = X.toarray() imputer_mask = _get_mask(X, self.missing_values) + # convert boolean mask to binary mask imputer_mask = astype(imputer_mask, int, copy=False) - feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] + 
feat_with_missing = imputer_mask.sum(axis=0).nonzero()[0] - if self.sparse is True: - if sparse.issparse(imputer_mask): - imputer_mask = imputer_mask.tocsc() - else: - imputer_mask = sparse.csc_matrix(imputer_mask) + if ((self.sparse == 'auto' and sparse.issparse(imputer_mask)) or + self.sparse is True): + imputer_mask = sparse.csc_matrix(imputer_mask) elif self.sparse is False and sparse.issparse(imputer_mask): imputer_mask = imputer_mask.toarray() - elif self.sparse == 'auto' and self.missing_values != 0: - if sparse.issparse(imputer_mask): - imputer_mask = imputer_mask.tocsc() return imputer_mask, feat_with_missing diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index b80cc0204f464..77d98e380eb1b 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -11,7 +11,7 @@ from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message - +from sklearn.utils.testing import assert_no_warnings from sklearn.preprocessing.imputation import Imputer from sklearn.preprocessing.imputation import MissingIndicator @@ -415,35 +415,44 @@ def assert_mask(actual, expected, features): assert_array_equal(actual, expected[:, features]) def _check_missing_indicator(X1, X2, retype, sp, missing_values): - mask = X2 == missing_values + mask_X2 = X2 == missing_values + mask_X1 = X1 == missing_values + expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0] X1_in = retype(X1) X2_in = retype(X2) # features = "train": - MI = MissingIndicator(missing_values=missing_values, - sparse=sp) + indicator = MissingIndicator(missing_values=missing_values, sparse=sp) - MI.fit(X1_in) - X2_tr = MI.transform(X2_in) - features = MI.feat_with_missing_ + X1_tr = indicator.fit_transform(X1_in) + X2_tr = indicator.transform(X2_in) + features = indicator.feat_with_missing_ 
assert_array_equal(expect_feat_missing, features) assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) - assert_mask(X2_tr, mask, features) + assert_type(type(X1_tr), sparse.issparse(X1_in), sp, missing_values) + assert_mask(X2_tr, mask_X2, features) + assert_mask(X1_tr, mask_X1, features) # features = "all" - MI = clone(MI).set_params(features="all") - MI.fit(X1_in) - X2_tr = MI.transform(X2_in) + indicator = clone(indicator).set_params(features="all") + X1_tr = indicator.fit_transform(X1_in) + X2_tr = indicator.transform(X2_in) features = np.arange(X2.shape[1]) - assert_mask(X2_tr, mask, features) + assert_type(type(X1_tr), sparse.issparse(X1_in), sp, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) + assert_mask(X2_tr, mask_X2, features) + assert_mask(X1_tr, mask_X1, features) # features = [1, 2] features = [1, 2] - MI = clone(MI).set_params(features=features) - MI.fit(X1_in) - X2_tr = MI.transform(X2_in) - assert_mask(X2_tr, mask, features) + indicator = clone(indicator).set_params(features=features) + X1_tr = indicator.fit_transform(X1_in) + X2_tr = indicator.transform(X2_in) + assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sp, missing_values) + assert_mask(X2_tr, mask_X2, features) + assert_mask(X1_tr, mask_X1, features) for X1, X2, missing_values in [(X1_orig, X2_orig, -1), (X1_orig + 1, X2_orig + 1, 0)]: @@ -464,12 +473,23 @@ def test_missing_indicator_warning(): [-1, 2, 3], [2, 4, 0] ]) - MI = MissingIndicator(missing_values=-1) - MI.fit(X1) - missing_features_fit = np.where(np.any(X1 == -1, axis=0))[0] + indicator = MissingIndicator(missing_values=-1) + indicator.fit(X1) + missing_features_fit = np.sum(X1 == -1, axis=0).nonzero()[0] missing_features_tr = np.where(np.any(X2 == -1, axis=0))[0] extra_missing_features = np.setdiff1d(missing_features_tr, missing_features_fit) warn_msg = "The features %s have missing values in transform 
" \ "but have no missing values in fit" % extra_missing_features - assert_warns_message(RuntimeWarning, warn_msg, MI.transform, X2) + assert_warns_message(RuntimeWarning, warn_msg, indicator.transform, X2) + + # features = "all" + indicator = clone(indicator).set_params(features="all") + indicator.fit(X1) + assert_no_warnings(indicator.transform, X2) + + # features = [0, 2] + features = [0, 2] + indicator = clone(indicator).set_params(features=features) + indicator.fit(X1) + assert_no_warnings(indicator.transform, X2) From 552a2cbb3767cdd6e91287e7c7e9fcb4ff7e2cb7 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Mon, 6 Mar 2017 23:27:24 +0530 Subject: [PATCH 11/49] modify doc [ci skip] --- doc/modules/preprocessing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 605bbb76756af..fb876c45ba55c 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -607,8 +607,8 @@ The knowledge of which features were imputed can be exploited by a downstream estimator by adding features that indicate which elements have been imputed. The ``features`` attribute is used to choose the features for which the mask is constructed. -By default, the binary matrix has only features with at least one missing value. -In case it mentioned as *all* the matrix has all the features in the input +By default, it is *train* which means the binary matrix has features with missing values +during fit time. In case it mentioned as *all* the matrix has all the features. 
>>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np From 0d980e45fe293ad4ccbf2f2c95589f65e0ca4684 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 7 Mar 2017 22:55:06 +0530 Subject: [PATCH 12/49] fix failing tests --- sklearn/preprocessing/imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index cafcaf120344c..5bd1f3319ceac 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -501,7 +501,7 @@ def transform(self, X): return imputer_mask - def fit_transform(self, X): + def fit_transform(self, X, y=None): """Generate missing values indicator for X. Parameters From a1a69822025e726ce468038dc0f91f6c6cef38f6 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Mon, 5 Jun 2017 20:12:40 +0530 Subject: [PATCH 13/49] Change default to np.NaN --- doc/modules/preprocessing.rst | 15 +++------ sklearn/preprocessing/imputation.py | 33 +++++++++---------- .../preprocessing/tests/test_imputation.py | 22 ++++++------- 3 files changed, 31 insertions(+), 39 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index fb876c45ba55c..881e4c2cc6caa 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -603,12 +603,12 @@ Imputation of missing values :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding binary matrix indicating the presence of missing values in the dataset. -The knowledge of which features were imputed can be exploited by a downstream -estimator by adding features that indicate which elements have been imputed. +This transformation is useful in conjunction with imputation. When using imputation, +preserving the information which values have been imputed can be informative. -The ``features`` attribute is used to choose the features for which the mask is constructed. 
-By default, it is *train* which means the binary matrix has features with missing values -during fit time. In case it mentioned as *all* the matrix has all the features. +The ``features`` parameter is used to choose the features for which the mask is constructed. +By default, it is 'train' which means the binary matrix has features with missing values +during fit time. When it is *all* the matrix has all the features. >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np @@ -617,11 +617,6 @@ during fit time. In case it mentioned as *all* the matrix has all the features. ... [ 4, -1, 0, -1], ... [ 8, -1, 1, 0], ... ]) - >>> X2 = np.array([ - ... [ 5, 1, -1, -1], - ... [-1, -1, 2, 3], - ... [ 2, 3, 4, 0], - ... ]) >>> indicator = MissingIndicator(missing_values=-1) >>> X1_tr = indicator.fit_transform(X1) >>> X1_tr diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 5bd1f3319ceac..2bf44b87c46e0 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -374,14 +374,13 @@ def transform(self, X): class MissingIndicator(BaseEstimator, TransformerMixin): - """Missing values indicator transformer for indicating missing values. + """Binary indicators for missing values. Parameters ---------- - missing_values : integer or "NaN", optional (default="NaN") + missing_values : integer or np.NaN, optional (default=np.NaN) The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. For missing values encoded as np.nan, - use the string value "NaN". + ``missing_values`` will be imputed. features : {'train' (default), 'all', array-like of int} If "all", mask will represent all features. @@ -400,18 +399,18 @@ class MissingIndicator(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np >>> X1 = np.array([ - ... [-1, 1, 3], - ... [ 4, 0, -1], + ... [np.NaN, 1, 3], + ... [ 4, 0, np.NaN], ... 
[ 8, 1, 0] ... ]) >>> X2 = np.array([ - ... [ 5, -1, -1], - ... [-1, 2, 3], + ... [ 5, np.NaN, np.NaN], + ... [np.NaN, 2, 3], ... [ 2, 4, 0] ... ]) - >>> indicator = MissingIndicator(missing_values=-1) + >>> indicator = MissingIndicator(missing_values=np.NaN) >>> indicator.fit(X1) - MissingIndicator(features='train', missing_values=-1, sparse='auto') + MissingIndicator(features='train', missing_values=nan, sparse='auto') >>> X2_tr = indicator.transform(X2) >>> X2_tr array([[0, 1], @@ -420,15 +419,15 @@ class MissingIndicator(BaseEstimator, TransformerMixin): Attributes ---------- - feat_with_missing_ : array of shape(n_missing_features, ) + feat_with_missing_ : array of shape(n_missing_features,) The features with missing values. - Note that this is only stored if features == 'train + Note that this is only stored if features == 'train' n_features_ : int - The number of features during fit time. + The number of features in the input. """ - def __init__(self, missing_values="NaN", features="train", sparse="auto"): + def __init__(self, missing_values=np.NaN, features="train", sparse="auto"): self.missing_values = missing_values self.features = features self.sparse = sparse @@ -449,7 +448,7 @@ def fit(self, X, y=None): """ self._validate_params() - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) self.n_features_ = X.shape[1] if (isinstance(self.features, six.string_types) and @@ -478,7 +477,7 @@ def transform(self, X): else: check_is_fitted(self, 'n_features_') - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) if X.shape[1] != self.n_features_: raise ValueError("X has a different number of features " "than during fitting.") @@ -516,7 +515,7 @@ def fit_transform(self, X, y=None): """ self._validate_params() - X = check_array(X, accept_sparse=('csc', 
'csr'), dtype=np.float64) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) imputer_mask, feat_with_missing = self._get_missing_features_info(X) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 77d98e380eb1b..3a07572fde4f3 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -383,18 +383,18 @@ def test_imputation_copy(): def test_missing_indicator(): X1_orig = np.array([ - [-1, -1, 1, 3], - [4, -1, 0, -1], - [8, -1, 1, 0], - [0, -1, 0, 15], - [16, -1, 1, 19] + [-1, -1, 1, 3], + [4, -1, 0, -1], + [8, -1, 1, 0], + [0, -1, 0, 15], + [16, -1, 1, 19] ]) X2_orig = np.array([ - [5, 1, 1, -1], - [-1, -1, 2, 3], - [2, 3, 4, 0], - [0, -1, 5, -1], - [11, -1, 1, 1] + [5, 1, 1, -1], + [-1, -1, 2, 3], + [2, 3, 4, 0], + [0, -1, 5, -1], + [11, -1, 1, 1] ]) def assert_type(actual, is_sparse, sp, missing_values): @@ -424,7 +424,6 @@ def _check_missing_indicator(X1, X2, retype, sp, missing_values): X2_in = retype(X2) # features = "train": indicator = MissingIndicator(missing_values=missing_values, sparse=sp) - X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) features = indicator.feat_with_missing_ @@ -444,7 +443,6 @@ def _check_missing_indicator(X1, X2, retype, sp, missing_values): assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) - # features = [1, 2] features = [1, 2] indicator = clone(indicator).set_params(features=features) X1_tr = indicator.fit_transform(X1_in) From 91b012225e6a9230ec5e2ee0b316ff53744405ca Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 6 Jun 2017 20:48:15 +0530 Subject: [PATCH 14/49] Error when transform has features with missing values while not during fit --- sklearn/preprocessing/imputation.py | 56 +++++++++++-------- .../preprocessing/tests/test_imputation.py | 13 +++-- 2 files changed, 39 insertions(+), 30 deletions(-) diff 
--git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 2bf44b87c46e0..50bbb1a217974 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -394,6 +394,10 @@ class MissingIndicator(BaseEstimator, TransformerMixin): If False, the transformed ``X`` will be a numpy array. If "auto", the transformed ``X`` will be of same type as input. + strict : boolean, optional (default=True) + If True, transform will raise an error when there are features with + missing values in transform but have no missing values in fit + Example ------- >>> from sklearn.preprocessing import MissingIndicator @@ -404,13 +408,14 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [ 8, 1, 0] ... ]) >>> X2 = np.array([ - ... [ 5, np.NaN, np.NaN], + ... [ 5, 1, np.NaN], ... [np.NaN, 2, 3], ... [ 2, 4, 0] ... ]) >>> indicator = MissingIndicator(missing_values=np.NaN) >>> indicator.fit(X1) - MissingIndicator(features='train', missing_values=nan, sparse='auto') + MissingIndicator(features='train', missing_values=nan, sparse='auto', + strict=True) >>> X2_tr = indicator.transform(X2) >>> X2_tr array([[0, 1], @@ -421,16 +426,16 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ---------- feat_with_missing_ : array of shape(n_missing_features,) The features with missing values. - Note that this is only stored if features == 'train' n_features_ : int The number of features in the input. """ - def __init__(self, missing_values=np.NaN, features="train", sparse="auto"): + def __init__(self, missing_values=np.NaN, features="train", sparse="auto", strict=True): self.missing_values = missing_values self.features = features self.sparse = sparse + self.strict = strict def fit(self, X, y=None): """Fit the transformer on X. 
@@ -451,9 +456,13 @@ def fit(self, X, y=None): X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) self.n_features_ = X.shape[1] - if (isinstance(self.features, six.string_types) and - self.features == "train"): - _, self.feat_with_missing_ = self._get_missing_features_info(X) + if isinstance(self.features, six.string_types): + if self.features == "train": + _, self.feat_with_missing_ = self._get_missing_features_info(X) + else : # self.features == "all" + self.feat_with_missing_ = np.arange(self.n_features_) + else: + self.feat_with_missing_ = self.features return self @@ -471,11 +480,7 @@ def transform(self, X): The missing indicator for input data """ - if (isinstance(self.features, six.string_types) and - self.features == "train"): - check_is_fitted(self, "feat_with_missing_", "n_features_") - else: - check_is_fitted(self, 'n_features_') + check_is_fitted(self, "feat_with_missing_", "n_features_") X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) if X.shape[1] != self.n_features_: @@ -488,15 +493,14 @@ def transform(self, X): if self.features == "train": features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) - if features.size > 0: - warnings.warn("The features %s have missing values " - "in transform but have no missing values " - "in fit " % features, RuntimeWarning, - stacklevel=1) - imputer_mask = imputer_mask[:, self.feat_with_missing_] + if self.strict and features.size > 0: + raise Exception("The features %s have missing values " + "in transform but have no missing values " + "in fit" % features) - else: # features is array-like - imputer_mask = imputer_mask[:, self.features] + if not (isinstance(self.features, six.string_types) and self.features == "all"): + # no need to slice when all features have missing values + imputer_mask = imputer_mask[:, self.feat_with_missing_] return imputer_mask @@ -523,10 +527,14 @@ def fit_transform(self, X, y=None): if 
isinstance(self.features, six.string_types): if self.features == "train": self.feat_with_missing_ = feat_with_missing - imputer_mask = imputer_mask[:, self.feat_with_missing_] + else : # self.features == "all" + self.feat_with_missing_ = np.arange(self.n_features_) + else : + self.feat_with_missing_ = self.features - else: # features is array-like - imputer_mask = imputer_mask[:, self.features] + if not (isinstance(self.features, six.string_types) and self.features == "all"): + # no need to slice when all features have missing values + imputer_mask = imputer_mask[:, self.feat_with_missing_] return imputer_mask @@ -560,7 +568,7 @@ def _get_missing_features_info(self, X): X = X.toarray() imputer_mask = _get_mask(X, self.missing_values) # convert boolean mask to binary mask - imputer_mask = astype(imputer_mask, int, copy=False) + imputer_mask = imputer_mask.astype(int, copy=False) feat_with_missing = imputer_mask.sum(axis=0).nonzero()[0] if ((self.sparse == 'auto' and sparse.issparse(imputer_mask)) or diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 3a07572fde4f3..5f8084f20c34c 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -12,6 +12,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_no_warnings +from sklearn.utils.testing import assert_raises_regexp from sklearn.preprocessing.imputation import Imputer from sklearn.preprocessing.imputation import MissingIndicator @@ -460,7 +461,7 @@ def _check_missing_indicator(X1, X2, retype, sp, missing_values): _check_missing_indicator(X1, X2, retype, sp, missing_values) -def test_missing_indicator_warning(): +def test_missing_indicator_error(): X1 = np.array([ [-1, 1, 3], [4, 0, -1], @@ -477,17 +478,17 @@ def test_missing_indicator_warning(): missing_features_tr = np.where(np.any(X2 == -1, 
axis=0))[0] extra_missing_features = np.setdiff1d(missing_features_tr, missing_features_fit) - warn_msg = "The features %s have missing values in transform " \ - "but have no missing values in fit" % extra_missing_features - assert_warns_message(RuntimeWarning, warn_msg, indicator.transform, X2) + err_msg = "The features \%s have missing values in transform " \ + "but have no missing values in fit" % extra_missing_features + assert_raises_regexp(Exception, err_msg, indicator.transform, X2) # features = "all" indicator = clone(indicator).set_params(features="all") indicator.fit(X1) - assert_no_warnings(indicator.transform, X2) + indicator.transform(X2) # features = [0, 2] features = [0, 2] indicator = clone(indicator).set_params(features=features) indicator.fit(X1) - assert_no_warnings(indicator.transform, X2) + indicator.transform(X2) From 4137ed315424a881e4819b2c3b515f45ad64bd42 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Wed, 19 Jul 2017 20:30:31 +0530 Subject: [PATCH 15/49] Doc and test changes --- doc/modules/preprocessing.rst | 4 +- sklearn/preprocessing/imputation.py | 2 + .../preprocessing/tests/test_imputation.py | 62 +++++++++++-------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 881e4c2cc6caa..57def07f6df16 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -604,11 +604,11 @@ Imputation of missing values :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding binary matrix indicating the presence of missing values in the dataset. This transformation is useful in conjunction with imputation. When using imputation, -preserving the information which values have been imputed can be informative. +preserving the information about which values have been imputed can be informative. The ``features`` parameter is used to choose the features for which the mask is constructed. 
By default, it is 'train' which means the binary matrix has features with missing values -during fit time. When it is *all* the matrix has all the features. +during fit time. When it is 'all' the matrix has all the features. >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 50bbb1a217974..cfba654de8b08 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -398,6 +398,8 @@ class MissingIndicator(BaseEstimator, TransformerMixin): If True, transform will raise an error when there are features with missing values in transform but have no missing values in fit + This is applicable only when ``features`` = "train" + Example ------- >>> from sklearn.preprocessing import MissingIndicator diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 5f8084f20c34c..8bcc8c5c0d764 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -382,29 +382,37 @@ def test_imputation_copy(): # made, even if copy=False. 
+def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if value_to_mask == "NaN" or np.isnan(value_to_mask): + return np.isnan(X) + else: + return X == value_to_mask + + def test_missing_indicator(): X1_orig = np.array([ - [-1, -1, 1, 3], - [4, -1, 0, -1], - [8, -1, 1, 0], - [0, -1, 0, 15], - [16, -1, 1, 19] + [np.nan, np.nan, 1, 3], + [4, np.nan, 0, np.nan], + [8, np.nan, 1, 0], + [0, np.nan, 0, 15], + [16, np.nan, 1, 19] ]) X2_orig = np.array([ - [5, 1, 1, -1], - [-1, -1, 2, 3], + [5, 1, 1, np.nan], + [np.nan, np.nan, 2, 3], [2, 3, 4, 0], - [0, -1, 5, -1], - [11, -1, 1, 1] + [0, np.nan, 5, np.nan], + [11, np.nan, 1, 1] ]) - def assert_type(actual, is_sparse, sp, missing_values): - if sp is True: + def assert_type(actual, X_is_sparse, sparse_param, missing_values): + if sparse_param is True: assert_equal(actual, sparse.csc_matrix) - elif ((sp is "auto" and missing_values == 0) or sp is False): + elif ((sparse_param == "auto" and missing_values == 0) or sparse_param is False): assert_equal(actual, np.ndarray) else: - if is_sparse: + if X_is_sparse: assert_equal(actual, sparse.csc_matrix) else: assert_equal(actual, np.ndarray) @@ -415,22 +423,22 @@ def assert_mask(actual, expected, features): else: assert_array_equal(actual, expected[:, features]) - def _check_missing_indicator(X1, X2, retype, sp, missing_values): - mask_X2 = X2 == missing_values - mask_X1 = X1 == missing_values + def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): + mask_X2 = _get_mask(X2, missing_values) + mask_X1 = _get_mask(X1, missing_values) - expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0] + expect_feat_missing = np.where(np.any(mask_X1, axis=0))[0] X1_in = retype(X1) X2_in = retype(X2) # features = "train": - indicator = MissingIndicator(missing_values=missing_values, sparse=sp) + indicator = MissingIndicator(missing_values=missing_values, sparse=sparse_param) X1_tr = indicator.fit_transform(X1_in) X2_tr = 
indicator.transform(X2_in) features = indicator.feat_with_missing_ assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sp, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) @@ -439,8 +447,8 @@ def _check_missing_indicator(X1, X2, retype, sp, missing_values): X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) features = np.arange(X2.shape[1]) - assert_type(type(X1_tr), sparse.issparse(X1_in), sp, missing_values) - assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) @@ -448,17 +456,17 @@ def _check_missing_indicator(X1, X2, retype, sp, missing_values): indicator = clone(indicator).set_params(features=features) X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) - assert_type(type(X2_tr), sparse.issparse(X2_in), sp, missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sp, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) - for X1, X2, missing_values in [(X1_orig, X2_orig, -1), + for X1, X2, missing_values in [(X1_orig, X2_orig, np.nan), (X1_orig + 1, X2_orig + 1, 0)]: for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix]: - for sp in [True, False, 'auto']: - _check_missing_indicator(X1, X2, 
retype, sp, missing_values) + for sparse_param in [True, False, 'auto']: + _check_missing_indicator(X1, X2, retype, sparse_param, missing_values) def test_missing_indicator_error(): From fb3d55a66d3f93557283d2c9dbc6e021bf58d390 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 17 Aug 2017 20:09:13 +0530 Subject: [PATCH 16/49] Documentation changes and remove duplicate code --- doc/modules/preprocessing.rst | 4 +- sklearn/preprocessing/imputation.py | 55 +++++++------------ .../preprocessing/tests/test_imputation.py | 12 +--- 3 files changed, 23 insertions(+), 48 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 57def07f6df16..9e2a02859ab40 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -601,13 +601,13 @@ Tools for imputing missing values are discussed at :ref:`impute`. Imputation of missing values ===================================== -:class:`MissingIndicator` transformer is useful to transform a dataset into corresponding +The :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding binary matrix indicating the presence of missing values in the dataset. This transformation is useful in conjunction with imputation. When using imputation, preserving the information about which values have been imputed can be informative. The ``features`` parameter is used to choose the features for which the mask is constructed. -By default, it is 'train' which means the binary matrix has features with missing values +By default, it is 'auto' which means the binary matrix has features with missing values during fit time. When it is 'all' the matrix has all the features. 
>>> from sklearn.preprocessing import MissingIndicator diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index cfba654de8b08..1074c0fb0950d 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -380,11 +380,11 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ---------- missing_values : integer or np.NaN, optional (default=np.NaN) The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. + ``missing_values`` will be represented as ones. - features : {'train' (default), 'all', array-like of int} + features : {'auto' (default), 'all', array-like of int} If "all", mask will represent all features. - If "train", mask will only represent features with missing values + If "auto", mask will only represent features with missing values during fit time. If mask/indices, mask will only represent features in the indices or mask. @@ -394,11 +394,10 @@ class MissingIndicator(BaseEstimator, TransformerMixin): If False, the transformed ``X`` will be a numpy array. If "auto", the transformed ``X`` will be of same type as input. - strict : boolean, optional (default=True) + error_on_new : boolean, optional (default=True) If True, transform will raise an error when there are features with missing values in transform but have no missing values in fit - - This is applicable only when ``features`` = "train" + This is applicable only when ``features`` = "auto" Example ------- @@ -414,10 +413,10 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [np.NaN, 2, 3], ... [ 2, 4, 0] ... 
]) - >>> indicator = MissingIndicator(missing_values=np.NaN) + >>> indicator = MissingIndicator() >>> indicator.fit(X1) - MissingIndicator(features='train', missing_values=nan, sparse='auto', - strict=True) + MissingIndicator(error_on_new=True, features='auto', missing_values=nan, + sparse='auto') >>> X2_tr = indicator.transform(X2) >>> X2_tr array([[0, 1], @@ -433,11 +432,12 @@ class MissingIndicator(BaseEstimator, TransformerMixin): The number of features in the input. """ - def __init__(self, missing_values=np.NaN, features="train", sparse="auto", strict=True): + def __init__(self, missing_values=np.NaN, features="auto", sparse="auto", + error_on_new=True): self.missing_values = missing_values self.features = features self.sparse = sparse - self.strict = strict + self.error_on_new = error_on_new def fit(self, X, y=None): """Fit the transformer on X. @@ -459,9 +459,9 @@ def fit(self, X, y=None): self.n_features_ = X.shape[1] if isinstance(self.features, six.string_types): - if self.features == "train": + if self.features == "auto": _, self.feat_with_missing_ = self._get_missing_features_info(X) - else : # self.features == "all" + else: # self.features == "all" self.feat_with_missing_ = np.arange(self.n_features_) else: self.feat_with_missing_ = self.features @@ -492,10 +492,10 @@ def transform(self, X): imputer_mask, feat_with_missing = self._get_missing_features_info(X) if isinstance(self.features, six.string_types): - if self.features == "train": + if self.features == "auto": features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) - if self.strict and features.size > 0: + if self.error_on_new and features.size > 0: raise Exception("The features %s have missing values " "in transform but have no missing values " "in fit" % features) @@ -520,30 +520,13 @@ def fit_transform(self, X, y=None): The missing indicator for input data """ - self._validate_params() - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) - - 
imputer_mask, feat_with_missing = self._get_missing_features_info(X) - - self.n_features_ = X.shape[1] - if isinstance(self.features, six.string_types): - if self.features == "train": - self.feat_with_missing_ = feat_with_missing - else : # self.features == "all" - self.feat_with_missing_ = np.arange(self.n_features_) - else : - self.feat_with_missing_ = self.features - - if not (isinstance(self.features, six.string_types) and self.features == "all"): - # no need to slice when all features have missing values - imputer_mask = imputer_mask[:, self.feat_with_missing_] - - return imputer_mask + self.fit(X, y) + return self.transform(X) def _validate_params(self): if (isinstance(self.features, six.string_types) and - self.features not in ["train", "all"]): - raise ValueError("Can only use these options: 'train', 'all'" + self.features not in ["auto", "all"]): + raise ValueError("Can only use these options: 'auto', 'all'" " got {0}".format(self.features)) elif not isinstance(self.features, six.string_types): self.features = check_array(self.features, ensure_2d=False) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 8bcc8c5c0d764..9b2483eb9afda 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -20,7 +20,7 @@ from sklearn.model_selection import GridSearchCV from sklearn import tree from sklearn.random_projection import sparse_random_matrix - +from sklearn.preprocessing.imputation import _get_mask @ignore_warnings def _check_statistics(X, X_true, @@ -382,14 +382,6 @@ def test_imputation_copy(): # made, even if copy=False. 
-def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or np.isnan(value_to_mask): - return np.isnan(X) - else: - return X == value_to_mask - - def test_missing_indicator(): X1_orig = np.array([ [np.nan, np.nan, 1, 3], @@ -431,7 +423,7 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): X1_in = retype(X1) X2_in = retype(X2) - # features = "train": + # features = "auto": indicator = MissingIndicator(missing_values=missing_values, sparse=sparse_param) X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) From 500aa6532ffc5424fbb950d5718401e193ff5ee4 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Mon, 21 Aug 2017 18:02:31 +0530 Subject: [PATCH 17/49] fix tests --- sklearn/preprocessing/imputation.py | 2 ++ sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 1074c0fb0950d..dd0713504d537 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -493,6 +493,8 @@ def transform(self, X): if isinstance(self.features, six.string_types): if self.features == "auto": + print feat_with_missing + print self.feat_with_missing_ features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) if self.error_on_new and features.size > 0: diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9a321e914b238..bbffa1c32d2ae 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -93,7 +93,7 @@ def _yield_non_meta_checks(name, estimator): # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency - if name not in ['SimpleImputer', 'Imputer']: + if name not in ['SimpleImputer', 'Imputer', 'MissingIndicator']: # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf From 
3e7c4c15b26997407f2cfe4764c4d109e653acc4 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Fri, 25 Aug 2017 12:29:49 +0530 Subject: [PATCH 18/49] fix estimator common tests --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/imputation.py | 12 ++++++------ sklearn/preprocessing/tests/test_imputation.py | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 9e2a02859ab40..05a4b641bea62 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -604,7 +604,7 @@ Imputation of missing values The :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding binary matrix indicating the presence of missing values in the dataset. This transformation is useful in conjunction with imputation. When using imputation, -preserving the information about which values have been imputed can be informative. +preserving the information about which values had been missing can be informative. The ``features`` parameter is used to choose the features for which the mask is constructed. By default, it is 'auto' which means the binary matrix has features with missing values diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index dd0713504d537..5bed1daa4d007 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -404,18 +404,18 @@ class MissingIndicator(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np >>> X1 = np.array([ - ... [np.NaN, 1, 3], - ... [ 4, 0, np.NaN], + ... ["NaN", 1, 3], + ... [ 4, 0, "NaN"], ... [ 8, 1, 0] ... ]) >>> X2 = np.array([ - ... [ 5, 1, np.NaN], - ... [np.NaN, 2, 3], + ... [ 5, 1, "NaN"], + ... ["NaN", 2, 3], ... [ 2, 4, 0] ... 
]) >>> indicator = MissingIndicator() >>> indicator.fit(X1) - MissingIndicator(error_on_new=True, features='auto', missing_values=nan, + MissingIndicator(error_on_new=True, features='auto', missing_values='NaN', sparse='auto') >>> X2_tr = indicator.transform(X2) >>> X2_tr @@ -432,7 +432,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): The number of features in the input. """ - def __init__(self, missing_values=np.NaN, features="auto", sparse="auto", + def __init__(self, missing_values="NaN", features="auto", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 9b2483eb9afda..a81a1cb87ba76 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -416,6 +416,7 @@ def assert_mask(actual, expected, features): assert_array_equal(actual, expected[:, features]) def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): + print retype, sparse_param mask_X2 = _get_mask(X2, missing_values) mask_X1 = _get_mask(X1, missing_values) From 426d1791bfef62bbf3f82c0522a31c7419651335 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Fri, 25 Aug 2017 12:57:42 +0530 Subject: [PATCH 19/49] fix sparse array tests --- sklearn/preprocessing/imputation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 5bed1daa4d007..d3cf24a222c0b 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -564,4 +564,5 @@ def _get_missing_features_info(self, X): elif self.sparse is False and sparse.issparse(imputer_mask): imputer_mask = imputer_mask.toarray() + feat_with_missing = feat_with_missing.ravel() return imputer_mask, feat_with_missing From 9b0e9bed7201ebd1b9d8796bc08f9c0c95b2f964 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: 
Fri, 25 Aug 2017 13:22:52 +0530 Subject: [PATCH 20/49] fix sparse array tests --- sklearn/preprocessing/imputation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index d3cf24a222c0b..fec78a3bdb587 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -564,5 +564,9 @@ def _get_missing_features_info(self, X): elif self.sparse is False and sparse.issparse(imputer_mask): imputer_mask = imputer_mask.toarray() + print 'before' + print feat_with_missing feat_with_missing = feat_with_missing.ravel() + print 'after' + print feat_with_missing return imputer_mask, feat_with_missing From c45ad5f236f81f5a1377bc13ee9ac05f3fd6e0d0 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Fri, 25 Aug 2017 13:42:15 +0530 Subject: [PATCH 21/49] fix sparse array tests --- sklearn/preprocessing/imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index fec78a3bdb587..48a27640e7e9f 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -566,7 +566,7 @@ def _get_missing_features_info(self, X): print 'before' print feat_with_missing - feat_with_missing = feat_with_missing.ravel() + feat_with_missing = feat_with_missing.flatten() print 'after' print feat_with_missing return imputer_mask, feat_with_missing From cc23f131c0a1ada591aeb7a09d6b6611e3406c63 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Fri, 25 Aug 2017 14:06:22 +0530 Subject: [PATCH 22/49] fix sparse array tests --- sklearn/preprocessing/imputation.py | 19 +++++------ .../preprocessing/tests/test_imputation.py | 34 ++++++++++++------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 48a27640e7e9f..8a75b153edb95 100644 --- a/sklearn/preprocessing/imputation.py +++ 
b/sklearn/preprocessing/imputation.py @@ -455,7 +455,8 @@ def fit(self, X, y=None): """ self._validate_params() - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, + force_all_finite=False) self.n_features_ = X.shape[1] if isinstance(self.features, six.string_types): @@ -484,7 +485,8 @@ def transform(self, X): """ check_is_fitted(self, "feat_with_missing_", "n_features_") - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, + force_all_finite=False) if X.shape[1] != self.n_features_: raise ValueError("X has a different number of features " "than during fitting.") @@ -493,8 +495,6 @@ def transform(self, X): if isinstance(self.features, six.string_types): if self.features == "auto": - print feat_with_missing - print self.feat_with_missing_ features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) if self.error_on_new and features.size > 0: @@ -502,7 +502,8 @@ def transform(self, X): "in transform but have no missing values " "in fit" % features) - if not (isinstance(self.features, six.string_types) and self.features == "all"): + if not (isinstance(self.features, six.string_types) and + self.features == "all") and len(self.feat_with_missing_) != 0: # no need to slice when all features have missing values imputer_mask = imputer_mask[:, self.feat_with_missing_] @@ -564,9 +565,7 @@ def _get_missing_features_info(self, X): elif self.sparse is False and sparse.issparse(imputer_mask): imputer_mask = imputer_mask.toarray() - print 'before' - print feat_with_missing - feat_with_missing = feat_with_missing.flatten() - print 'after' - print feat_with_missing + if isinstance(feat_with_missing, np.matrix): + feat_with_missing = feat_with_missing.A1 + return imputer_mask, feat_with_missing diff --git a/sklearn/preprocessing/tests/test_imputation.py 
b/sklearn/preprocessing/tests/test_imputation.py index a81a1cb87ba76..e6b9c7dea37c3 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -401,7 +401,8 @@ def test_missing_indicator(): def assert_type(actual, X_is_sparse, sparse_param, missing_values): if sparse_param is True: assert_equal(actual, sparse.csc_matrix) - elif ((sparse_param == "auto" and missing_values == 0) or sparse_param is False): + elif ((sparse_param == "auto" and missing_values == 0) or + sparse_param is False): assert_equal(actual, np.ndarray) else: if X_is_sparse: @@ -410,13 +411,14 @@ def assert_type(actual, X_is_sparse, sparse_param, missing_values): assert_equal(actual, np.ndarray) def assert_mask(actual, expected, features): + if len(features) != 0: + expected = expected[:, features] if hasattr(actual, 'toarray'): - assert_array_equal(actual.toarray(), expected[:, features]) + assert_array_equal(actual.toarray(), expected) else: - assert_array_equal(actual, expected[:, features]) + assert_array_equal(actual, expected) def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): - print retype, sparse_param mask_X2 = _get_mask(X2, missing_values) mask_X1 = _get_mask(X1, missing_values) @@ -425,13 +427,16 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): X1_in = retype(X1) X2_in = retype(X2) # features = "auto": - indicator = MissingIndicator(missing_values=missing_values, sparse=sparse_param) + indicator = MissingIndicator(missing_values=missing_values, + sparse=sparse_param) X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) features = indicator.feat_with_missing_ assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, + missing_values) + 
assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, + missing_values) assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) @@ -440,8 +445,10 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) features = np.arange(X2.shape[1]) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, + missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, + missing_values) assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) @@ -449,8 +456,10 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): indicator = clone(indicator).set_params(features=features) X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, + missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, + missing_values) assert_mask(X2_tr, mask_X2, features) assert_mask(X1_tr, mask_X1, features) @@ -459,7 +468,8 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix]: for sparse_param in [True, False, 'auto']: - _check_missing_indicator(X1, X2, retype, sparse_param, missing_values) + _check_missing_indicator(X1, X2, retype, sparse_param, + missing_values) def test_missing_indicator_error(): From f50d649606c33b0460e766adadfcaef5758fd0ab Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Fri, 12 Jan 2018 17:59:36 +0530 Subject: [PATCH 23/49] address comments 
and exception tests --- sklearn/preprocessing/imputation.py | 38 ++++++------- .../preprocessing/tests/test_imputation.py | 57 +++++++++++-------- 2 files changed, 52 insertions(+), 43 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 8a75b153edb95..592d735e7879d 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -404,13 +404,13 @@ class MissingIndicator(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import MissingIndicator >>> import numpy as np >>> X1 = np.array([ - ... ["NaN", 1, 3], - ... [ 4, 0, "NaN"], + ... [np.nan, 1, 3], + ... [ 4, 0, np.nan], ... [ 8, 1, 0] ... ]) >>> X2 = np.array([ - ... [ 5, 1, "NaN"], - ... ["NaN", 2, 3], + ... [ 5, 1, np.nan], + ... [np.nan, 2, 3], ... [ 2, 4, 0] ... ]) >>> indicator = MissingIndicator() @@ -425,7 +425,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): Attributes ---------- - feat_with_missing_ : array of shape(n_missing_features,) + features_ : array of shape(n_missing_features,) The features with missing values. 
n_features_ : int @@ -455,17 +455,17 @@ def fit(self, X, y=None): """ self._validate_params() - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, + X = check_array(X, accept_sparse=('csc', 'csr'), force_all_finite=False) self.n_features_ = X.shape[1] if isinstance(self.features, six.string_types): if self.features == "auto": - _, self.feat_with_missing_ = self._get_missing_features_info(X) + _, self.features_ = self._get_missing_features_info(X) else: # self.features == "all" - self.feat_with_missing_ = np.arange(self.n_features_) + self.features_ = np.arange(self.n_features_) else: - self.feat_with_missing_ = self.features + self.features_ = self.features return self @@ -483,29 +483,29 @@ def transform(self, X): The missing indicator for input data """ - check_is_fitted(self, "feat_with_missing_", "n_features_") + check_is_fitted(self, "features_", "n_features_") - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, + X = check_array(X, accept_sparse=('csc', 'csr'), force_all_finite=False) if X.shape[1] != self.n_features_: raise ValueError("X has a different number of features " "than during fitting.") - imputer_mask, feat_with_missing = self._get_missing_features_info(X) + imputer_mask, features_ = self._get_missing_features_info(X) if isinstance(self.features, six.string_types): if self.features == "auto": - features = np.setdiff1d(feat_with_missing, - self.feat_with_missing_) + features = np.setdiff1d(features_, + self.features_) if self.error_on_new and features.size > 0: - raise Exception("The features %s have missing values " - "in transform but have no missing values " - "in fit" % features) + raise ValueError("The features %s have missing values " + "in transform but have no missing values " + "in fit" % features) if not (isinstance(self.features, six.string_types) and - self.features == "all") and len(self.feat_with_missing_) != 0: + self.features == "all") and len(self.features_) != 0: # no need to slice when all features 
have missing values - imputer_mask = imputer_mask[:, self.feat_with_missing_] + imputer_mask = imputer_mask[:, self.features_] return imputer_mask diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index e6b9c7dea37c3..77299fa2b642e 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -13,6 +13,7 @@ from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_no_warnings from sklearn.utils.testing import assert_raises_regexp +from sklearn.utils.testing import assert_raises_regex from sklearn.preprocessing.imputation import Imputer from sklearn.preprocessing.imputation import MissingIndicator @@ -401,8 +402,9 @@ def test_missing_indicator(): def assert_type(actual, X_is_sparse, sparse_param, missing_values): if sparse_param is True: assert_equal(actual, sparse.csc_matrix) - elif ((sparse_param == "auto" and missing_values == 0) or - sparse_param is False): + elif (sparse_param == "auto" and missing_values == 0): + assert_equal(actual, np.ndarray) + elif sparse_param is False: assert_equal(actual, np.ndarray) else: if X_is_sparse: @@ -410,9 +412,7 @@ def assert_type(actual, X_is_sparse, sparse_param, missing_values): else: assert_equal(actual, np.ndarray) - def assert_mask(actual, expected, features): - if len(features) != 0: - expected = expected[:, features] + def assert_mask(actual, expected): if hasattr(actual, 'toarray'): assert_array_equal(actual.toarray(), expected) else: @@ -431,14 +431,18 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): sparse=sparse_param) X1_tr = indicator.fit_transform(X1_in) X2_tr = indicator.transform(X2_in) - features = indicator.feat_with_missing_ + features = indicator.features_ assert_array_equal(expect_feat_missing, features) assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) assert_type(type(X1_tr), 
sparse.issparse(X1_in), sparse_param, missing_values) - assert_mask(X2_tr, mask_X2, features) - assert_mask(X1_tr, mask_X1, features) + if len(features) != 0: + assert_mask(X2_tr, mask_X2[:, features]) + assert_mask(X1_tr, mask_X1[:, features]) + else: + assert_mask(X2_tr, mask_X2) + assert_mask(X1_tr, mask_X1) # features = "all" indicator = clone(indicator).set_params(features="all") @@ -449,8 +453,8 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): missing_values) assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, missing_values) - assert_mask(X2_tr, mask_X2, features) - assert_mask(X1_tr, mask_X1, features) + assert_mask(X2_tr, mask_X2[:, features]) + assert_mask(X1_tr, mask_X1[:, features]) features = [1, 2] indicator = clone(indicator).set_params(features=features) @@ -460,8 +464,8 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): missing_values) assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, missing_values) - assert_mask(X2_tr, mask_X2, features) - assert_mask(X1_tr, mask_X1, features) + assert_mask(X2_tr, mask_X2[:, features]) + assert_mask(X1_tr, mask_X1[:, features]) for X1, X2, missing_values in [(X1_orig, X2_orig, np.nan), (X1_orig + 1, X2_orig + 1, 0)]: @@ -489,17 +493,22 @@ def test_missing_indicator_error(): missing_features_tr = np.where(np.any(X2 == -1, axis=0))[0] extra_missing_features = np.setdiff1d(missing_features_tr, missing_features_fit) - err_msg = "The features \%s have missing values in transform " \ - "but have no missing values in fit" % extra_missing_features - assert_raises_regexp(Exception, err_msg, indicator.transform, X2) + err_msg = ("The features \{0} have missing values " + "in transform but have no missing values " + "in fit".format(extra_missing_features)) + assert_raises_regex(ValueError, err_msg, indicator.transform, X2) - # features = "all" - indicator = clone(indicator).set_params(features="all") - indicator.fit(X1) - 
indicator.transform(X2) - - # features = [0, 2] - features = [0, 2] + # features is incorrect keyword + features = "temp" indicator = clone(indicator).set_params(features=features) - indicator.fit(X1) - indicator.transform(X2) + err_msg = ("Can only use these options: 'auto', 'all' got %s" % features) + assert_raises_regex(ValueError, err_msg, indicator.fit, X1) + + indicator = clone(indicator).set_params(features=[1.0, 2.0, 3.0]) + err_msg = ("Features should be an array of integers") + assert_raises_regex(ValueError, err_msg, indicator.fit, X1) + + sparse = "temp" + indicator = clone(indicator).set_params(features="auto", sparse=sparse) + err_msg = ("sparse can only boolean or 'auto' got {0}".format(sparse)) + assert_raises_regex(ValueError, err_msg, indicator.fit, X1) From 70e06f730d6b9fac687a92f8a6575bcd239357a5 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sun, 18 Feb 2018 10:55:49 +0530 Subject: [PATCH 24/49] Move MissingIndicator to impute.py --- doc/modules/impute.rst | 35 +++ doc/modules/preprocessing.rst | 34 --- sklearn/impute.py | 200 +++++++++++++++++- sklearn/preprocessing/__init__.py | 3 - sklearn/preprocessing/imputation.py | 200 +----------------- .../preprocessing/tests/test_imputation.py | 136 +----------- sklearn/tests/test_impute.py | 146 +++++++++++++ 7 files changed, 382 insertions(+), 372 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index e806cc2fd5b4a..dedc83c625be6 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -54,3 +54,38 @@ values than observed values. :class:`SimpleImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. + + +.. 
_missing_indicator: + +Imputation of missing values +===================================== + +The :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding +binary matrix indicating the presence of missing values in the dataset. +This transformation is useful in conjunction with imputation. When using imputation, +preserving the information about which values had been missing can be informative. + +The ``features`` parameter is used to choose the features for which the mask is constructed. +By default, it is 'auto' which means the binary matrix has features with missing values +during fit time. When it is 'all' the matrix has all the features. + + >>> from sklearn.impute import MissingIndicator + >>> import numpy as np + >>> X1 = np.array([ + ... [-1, -1, 1, 3], + ... [ 4, -1, 0, -1], + ... [ 8, -1, 1, 0], + ... ]) + >>> indicator = MissingIndicator(missing_values=-1) + >>> X1_tr = indicator.fit_transform(X1) + >>> X1_tr + array([[1, 1, 0], + [0, 1, 1], + [0, 1, 0]]) + >>> indicator = MissingIndicator(missing_values=-1, features="all") + >>> X1_tr = indicator.fit_transform(X1) + >>> X1_tr + array([[1, 1, 0, 0], + [0, 1, 0, 1], + [0, 1, 0, 0]]) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 05a4b641bea62..19bdfc0d432a0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -596,40 +596,6 @@ Imputation of missing values Tools for imputing missing values are discussed at :ref:`impute`. -.. _missing_indicator: - -Imputation of missing values -===================================== - -The :class:`MissingIndicator` transformer is useful to transform a dataset into corresponding -binary matrix indicating the presence of missing values in the dataset. -This transformation is useful in conjunction with imputation. When using imputation, -preserving the information about which values had been missing can be informative. 
- -The ``features`` parameter is used to choose the features for which the mask is constructed. -By default, it is 'auto' which means the binary matrix has features with missing values -during fit time. When it is 'all' the matrix has all the features. - - >>> from sklearn.preprocessing import MissingIndicator - >>> import numpy as np - >>> X1 = np.array([ - ... [-1, -1, 1, 3], - ... [ 4, -1, 0, -1], - ... [ 8, -1, 1, 0], - ... ]) - >>> indicator = MissingIndicator(missing_values=-1) - >>> X1_tr = indicator.fit_transform(X1) - >>> X1_tr - array([[1, 1, 0], - [0, 1, 1], - [0, 1, 0]]) - >>> indicator = MissingIndicator(missing_values=-1, features="all") - >>> X1_tr = indicator.fit_transform(X1) - >>> X1_tr - array([[1, 1, 0, 0], - [0, 1, 0, 1], - [0, 1, 0, 0]]) - .. _polynomial_features: Generating polynomial features diff --git a/sklearn/impute.py b/sklearn/impute.py index 2420e02560e42..81e2fbb046cdd 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -21,7 +21,7 @@ map = six.moves.map __all__ = [ - 'SimpleImputer', + 'SimpleImputer', 'MissingIndicator', ] @@ -321,3 +321,201 @@ def transform(self, X): X[coordinates] = values return X + + +class MissingIndicator(BaseEstimator, TransformerMixin): + """Binary indicators for missing values. + + Parameters + ---------- + missing_values : integer or np.NaN, optional (default=np.NaN) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be represented as ones. + + features : {'auto' (default), 'all', array-like of int} + If "all", mask will represent all features. + If "auto", mask will only represent features with missing values + during fit time. + If mask/indices, mask will only represent features in the + indices or mask. + + sparse : boolean or "auto", optional (default="auto") + If True, the transformed ``X`` will be a sparse matrix. + If False, the transformed ``X`` will be a numpy array. + If "auto", the transformed ``X`` will be of same type as input. 
+ + error_on_new : boolean, optional (default=True) + If True, transform will raise an error when there are features with + missing values in transform but have no missing values in fit + This is applicable only when ``features`` = "auto" + + Example + ------- + >>> from sklearn.impute import MissingIndicator + >>> import numpy as np + >>> X1 = np.array([ + ... [np.nan, 1, 3], + ... [ 4, 0, np.nan], + ... [ 8, 1, 0] + ... ]) + >>> X2 = np.array([ + ... [ 5, 1, np.nan], + ... [np.nan, 2, 3], + ... [ 2, 4, 0] + ... ]) + >>> indicator = MissingIndicator() + >>> indicator.fit(X1) + MissingIndicator(error_on_new=True, features='auto', missing_values='NaN', + sparse='auto') + >>> X2_tr = indicator.transform(X2) + >>> X2_tr + array([[0, 1], + [1, 0], + [0, 0]]) + + Attributes + ---------- + features_ : array of shape(n_missing_features,) + The features with missing values. + + n_features_ : int + The number of features in the input. + """ + + def __init__(self, missing_values="NaN", features="auto", sparse="auto", + error_on_new=True): + self.missing_values = missing_values + self.features = features + self.sparse = sparse + self.error_on_new = error_on_new + + def fit(self, X, y=None): + """Fit the transformer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + self._validate_params() + + X = check_array(X, accept_sparse=('csc', 'csr'), + force_all_finite=False) + self.n_features_ = X.shape[1] + + if isinstance(self.features, six.string_types): + if self.features == "auto": + _, self.features_ = self._get_missing_features_info(X) + else: # self.features == "all" + self.features_ = np.arange(self.n_features_) + else: + self.features_ = self.features + + return self + + def transform(self, X): + """Generate missing values indicator for X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array or sparse matrix, shape = [n_samples, n_features] + The missing indicator for input data + + """ + check_is_fitted(self, "features_", "n_features_") + + X = check_array(X, accept_sparse=('csc', 'csr'), + force_all_finite=False) + if X.shape[1] != self.n_features_: + raise ValueError("X has a different number of features " + "than during fitting.") + + imputer_mask, features_ = self._get_missing_features_info(X) + + if isinstance(self.features, six.string_types): + if self.features == "auto": + features = np.setdiff1d(features_, + self.features_) + if self.error_on_new and features.size > 0: + raise ValueError("The features %s have missing values " + "in transform but have no missing values " + "in fit" % features) + + if not (isinstance(self.features, six.string_types) and + self.features == "all") and len(self.features_) != 0: + # no need to slice when all features have missing values + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + def fit_transform(self, X, y=None): + """Generate missing values indicator for X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The input data to complete. 
+ + Returns + ------- + Xt : array or sparse matrix, shape = [n_samples, n_features] + The missing indicator for input data + + """ + self.fit(X, y) + return self.transform(X) + + def _validate_params(self): + if (isinstance(self.features, six.string_types) and + self.features not in ["auto", "all"]): + raise ValueError("Can only use these options: 'auto', 'all'" + " got {0}".format(self.features)) + elif not isinstance(self.features, six.string_types): + self.features = check_array(self.features, ensure_2d=False) + if (isinstance(self.features, np.ndarray) and + self.features.dtype.kind != 'i'): + raise ValueError("Features should be an array of integers") + + if not ((isinstance(self.sparse, six.string_types) and + self.sparse == "auto") or isinstance(self.sparse, bool)): + raise ValueError("sparse can only boolean or 'auto'" + " got {0}".format(self.sparse)) + + def _get_missing_features_info(self, X): + if sparse.issparse(X) and self.missing_values != 0: + # sparse matrix and missing values is not zero + imputer_mask = _get_mask(X.data, self.missing_values) + imputer_mask = X.__class__((imputer_mask, X.indices.copy(), + X.indptr.copy()), shape=X.shape, + dtype=X.dtype) + feat_with_missing = imputer_mask.sum(axis=0).nonzero()[1] + else: + # sparse with zero as missing value and dense matrix + if sparse.issparse(X): + X = X.toarray() + imputer_mask = _get_mask(X, self.missing_values) + # convert boolean mask to binary mask + imputer_mask = imputer_mask.astype(int, copy=False) + feat_with_missing = imputer_mask.sum(axis=0).nonzero()[0] + + if ((self.sparse == 'auto' and sparse.issparse(imputer_mask)) or + self.sparse is True): + imputer_mask = sparse.csc_matrix(imputer_mask) + elif self.sparse is False and sparse.issparse(imputer_mask): + imputer_mask = imputer_mask.toarray() + + if isinstance(feat_with_missing, np.matrix): + feat_with_missing = feat_with_missing.A1 + + return imputer_mask, feat_with_missing diff --git a/sklearn/preprocessing/__init__.py 
b/sklearn/preprocessing/__init__.py index 021abcfaf357d..91f09d962a430 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -33,8 +33,6 @@ from .label import MultiLabelBinarizer from .imputation import Imputer -from .imputation import MissingIndicator - __all__ = [ 'Binarizer', @@ -43,7 +41,6 @@ 'KernelCenterer', 'LabelBinarizer', 'LabelEncoder', - 'MissingIndicator', 'MultiLabelBinarizer', 'MinMaxScaler', 'MaxAbsScaler', diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 592d735e7879d..4318122d4be6c 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -21,7 +21,7 @@ map = six.moves.map __all__ = [ - 'Imputer', 'MissingIndicator' + 'Imputer', ] @@ -371,201 +371,3 @@ def transform(self, X): X[coordinates] = values return X - - -class MissingIndicator(BaseEstimator, TransformerMixin): - """Binary indicators for missing values. - - Parameters - ---------- - missing_values : integer or np.NaN, optional (default=np.NaN) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be represented as ones. - - features : {'auto' (default), 'all', array-like of int} - If "all", mask will represent all features. - If "auto", mask will only represent features with missing values - during fit time. - If mask/indices, mask will only represent features in the - indices or mask. - - sparse : boolean or "auto", optional (default="auto") - If True, the transformed ``X`` will be a sparse matrix. - If False, the transformed ``X`` will be a numpy array. - If "auto", the transformed ``X`` will be of same type as input. 
- - error_on_new : boolean, optional (default=True) - If True, transform will raise an error when there are features with - missing values in transform but have no missing values in fit - This is applicable only when ``features`` = "auto" - - Example - ------- - >>> from sklearn.preprocessing import MissingIndicator - >>> import numpy as np - >>> X1 = np.array([ - ... [np.nan, 1, 3], - ... [ 4, 0, np.nan], - ... [ 8, 1, 0] - ... ]) - >>> X2 = np.array([ - ... [ 5, 1, np.nan], - ... [np.nan, 2, 3], - ... [ 2, 4, 0] - ... ]) - >>> indicator = MissingIndicator() - >>> indicator.fit(X1) - MissingIndicator(error_on_new=True, features='auto', missing_values='NaN', - sparse='auto') - >>> X2_tr = indicator.transform(X2) - >>> X2_tr - array([[0, 1], - [1, 0], - [0, 0]]) - - Attributes - ---------- - features_ : array of shape(n_missing_features,) - The features with missing values. - - n_features_ : int - The number of features in the input. - """ - - def __init__(self, missing_values="NaN", features="auto", sparse="auto", - error_on_new=True): - self.missing_values = missing_values - self.features = features - self.sparse = sparse - self.error_on_new = error_on_new - - def fit(self, X, y=None): - """Fit the transformer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - self._validate_params() - - X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite=False) - self.n_features_ = X.shape[1] - - if isinstance(self.features, six.string_types): - if self.features == "auto": - _, self.features_ = self._get_missing_features_info(X) - else: # self.features == "all" - self.features_ = np.arange(self.n_features_) - else: - self.features_ = self.features - - return self - - def transform(self, X): - """Generate missing values indicator for X. 
- - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - The input data to complete. - - Returns - ------- - Xt : array or sparse matrix, shape = [n_samples, n_features] - The missing indicator for input data - - """ - check_is_fitted(self, "features_", "n_features_") - - X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite=False) - if X.shape[1] != self.n_features_: - raise ValueError("X has a different number of features " - "than during fitting.") - - imputer_mask, features_ = self._get_missing_features_info(X) - - if isinstance(self.features, six.string_types): - if self.features == "auto": - features = np.setdiff1d(features_, - self.features_) - if self.error_on_new and features.size > 0: - raise ValueError("The features %s have missing values " - "in transform but have no missing values " - "in fit" % features) - - if not (isinstance(self.features, six.string_types) and - self.features == "all") and len(self.features_) != 0: - # no need to slice when all features have missing values - imputer_mask = imputer_mask[:, self.features_] - - return imputer_mask - - def fit_transform(self, X, y=None): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - The input data to complete. 
- - Returns - ------- - Xt : array or sparse matrix, shape = [n_samples, n_features] - The missing indicator for input data - - """ - self.fit(X, y) - return self.transform(X) - - def _validate_params(self): - if (isinstance(self.features, six.string_types) and - self.features not in ["auto", "all"]): - raise ValueError("Can only use these options: 'auto', 'all'" - " got {0}".format(self.features)) - elif not isinstance(self.features, six.string_types): - self.features = check_array(self.features, ensure_2d=False) - if (isinstance(self.features, np.ndarray) and - self.features.dtype.kind != 'i'): - raise ValueError("Features should be an array of integers") - - if not ((isinstance(self.sparse, six.string_types) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("sparse can only boolean or 'auto'" - " got {0}".format(self.sparse)) - - def _get_missing_features_info(self, X): - if sparse.issparse(X) and self.missing_values != 0: - # sparse matrix and missing values is not zero - imputer_mask = _get_mask(X.data, self.missing_values) - imputer_mask = X.__class__((imputer_mask, X.indices.copy(), - X.indptr.copy()), shape=X.shape, - dtype=X.dtype) - feat_with_missing = imputer_mask.sum(axis=0).nonzero()[1] - else: - # sparse with zero as missing value and dense matrix - if sparse.issparse(X): - X = X.toarray() - imputer_mask = _get_mask(X, self.missing_values) - # convert boolean mask to binary mask - imputer_mask = imputer_mask.astype(int, copy=False) - feat_with_missing = imputer_mask.sum(axis=0).nonzero()[0] - - if ((self.sparse == 'auto' and sparse.issparse(imputer_mask)) or - self.sparse is True): - imputer_mask = sparse.csc_matrix(imputer_mask) - elif self.sparse is False and sparse.issparse(imputer_mask): - imputer_mask = imputer_mask.toarray() - - if isinstance(feat_with_missing, np.matrix): - feat_with_missing = feat_with_missing.A1 - - return imputer_mask, feat_with_missing diff --git 
a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 77299fa2b642e..4dad4ed856e78 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -12,16 +12,13 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_no_warnings -from sklearn.utils.testing import assert_raises_regexp -from sklearn.utils.testing import assert_raises_regex from sklearn.preprocessing.imputation import Imputer -from sklearn.preprocessing.imputation import MissingIndicator from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree from sklearn.random_projection import sparse_random_matrix -from sklearn.preprocessing.imputation import _get_mask + @ignore_warnings def _check_statistics(X, X_true, @@ -381,134 +378,3 @@ def test_imputation_copy(): # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is # made, even if copy=False. 
- - -def test_missing_indicator(): - X1_orig = np.array([ - [np.nan, np.nan, 1, 3], - [4, np.nan, 0, np.nan], - [8, np.nan, 1, 0], - [0, np.nan, 0, 15], - [16, np.nan, 1, 19] - ]) - X2_orig = np.array([ - [5, 1, 1, np.nan], - [np.nan, np.nan, 2, 3], - [2, 3, 4, 0], - [0, np.nan, 5, np.nan], - [11, np.nan, 1, 1] - ]) - - def assert_type(actual, X_is_sparse, sparse_param, missing_values): - if sparse_param is True: - assert_equal(actual, sparse.csc_matrix) - elif (sparse_param == "auto" and missing_values == 0): - assert_equal(actual, np.ndarray) - elif sparse_param is False: - assert_equal(actual, np.ndarray) - else: - if X_is_sparse: - assert_equal(actual, sparse.csc_matrix) - else: - assert_equal(actual, np.ndarray) - - def assert_mask(actual, expected): - if hasattr(actual, 'toarray'): - assert_array_equal(actual.toarray(), expected) - else: - assert_array_equal(actual, expected) - - def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): - mask_X2 = _get_mask(X2, missing_values) - mask_X1 = _get_mask(X1, missing_values) - - expect_feat_missing = np.where(np.any(mask_X1, axis=0))[0] - - X1_in = retype(X1) - X2_in = retype(X2) - # features = "auto": - indicator = MissingIndicator(missing_values=missing_values, - sparse=sparse_param) - X1_tr = indicator.fit_transform(X1_in) - X2_tr = indicator.transform(X2_in) - features = indicator.features_ - assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, - missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, - missing_values) - if len(features) != 0: - assert_mask(X2_tr, mask_X2[:, features]) - assert_mask(X1_tr, mask_X1[:, features]) - else: - assert_mask(X2_tr, mask_X2) - assert_mask(X1_tr, mask_X1) - - # features = "all" - indicator = clone(indicator).set_params(features="all") - X1_tr = indicator.fit_transform(X1_in) - X2_tr = indicator.transform(X2_in) - features = np.arange(X2.shape[1]) - 
assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, - missing_values) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, - missing_values) - assert_mask(X2_tr, mask_X2[:, features]) - assert_mask(X1_tr, mask_X1[:, features]) - - features = [1, 2] - indicator = clone(indicator).set_params(features=features) - X1_tr = indicator.fit_transform(X1_in) - X2_tr = indicator.transform(X2_in) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, - missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, - missing_values) - assert_mask(X2_tr, mask_X2[:, features]) - assert_mask(X1_tr, mask_X1[:, features]) - - for X1, X2, missing_values in [(X1_orig, X2_orig, np.nan), - (X1_orig + 1, X2_orig + 1, 0)]: - for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, - sparse.csc_matrix, sparse.lil_matrix]: - for sparse_param in [True, False, 'auto']: - _check_missing_indicator(X1, X2, retype, sparse_param, - missing_values) - - -def test_missing_indicator_error(): - X1 = np.array([ - [-1, 1, 3], - [4, 0, -1], - [8, 1, 0] - ]) - X2 = np.array([ - [5, -1, -1], - [-1, 2, 3], - [2, 4, 0] - ]) - indicator = MissingIndicator(missing_values=-1) - indicator.fit(X1) - missing_features_fit = np.sum(X1 == -1, axis=0).nonzero()[0] - missing_features_tr = np.where(np.any(X2 == -1, axis=0))[0] - extra_missing_features = np.setdiff1d(missing_features_tr, - missing_features_fit) - err_msg = ("The features \{0} have missing values " - "in transform but have no missing values " - "in fit".format(extra_missing_features)) - assert_raises_regex(ValueError, err_msg, indicator.transform, X2) - - # features is incorrect keyword - features = "temp" - indicator = clone(indicator).set_params(features=features) - err_msg = ("Can only use these options: 'auto', 'all' got %s" % features) - assert_raises_regex(ValueError, err_msg, indicator.fit, X1) - - indicator = clone(indicator).set_params(features=[1.0, 2.0, 3.0]) - err_msg = ("Features 
should be an array of integers") - assert_raises_regex(ValueError, err_msg, indicator.fit, X1) - - sparse = "temp" - indicator = clone(indicator).set_params(features="auto", sparse=sparse) - err_msg = ("sparse can only boolean or 'auto' got {0}".format(sparse)) - assert_raises_regex(ValueError, err_msg, indicator.fit, X1) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f2bf5912e2213..93e9ea891f7c7 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -2,13 +2,18 @@ import numpy as np from scipy import sparse +from sklearn.base import clone + from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false +from sklearn.utils.testing import assert_raises_regex +from sklearn.impute import _get_mask from sklearn.impute import SimpleImputer +from sklearn.impute import MissingIndicator from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -257,3 +262,144 @@ def test_imputation_copy(): Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) + + # copy=False, sparse csr, axis=1, missing_values=0 => copy + X = X_orig.copy() + imputer = SimpleImputer(missing_values=0, strategy="mean", + copy=False, axis=1) + Xt = imputer.fit(X).transform(X) + assert_false(sparse.issparse(Xt)) + + # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is + # made, even if copy=False. 
+ + +def test_missing_indicator(): + X1_orig = np.array([ + [np.nan, np.nan, 1, 3], + [4, np.nan, 0, np.nan], + [8, np.nan, 1, 0], + [0, np.nan, 0, 15], + [16, np.nan, 1, 19] + ]) + X2_orig = np.array([ + [5, 1, 1, np.nan], + [np.nan, np.nan, 2, 3], + [2, 3, 4, 0], + [0, np.nan, 5, np.nan], + [11, np.nan, 1, 1] + ]) + + def assert_type(actual, X_is_sparse, sparse_param, missing_values): + if sparse_param is True: + assert_equal(actual, sparse.csc_matrix) + elif (sparse_param == "auto" and missing_values == 0): + assert_equal(actual, np.ndarray) + elif sparse_param is False: + assert_equal(actual, np.ndarray) + else: + if X_is_sparse: + assert_equal(actual, sparse.csc_matrix) + else: + assert_equal(actual, np.ndarray) + + def assert_mask(actual, expected): + if hasattr(actual, 'toarray'): + assert_array_equal(actual.toarray(), expected) + else: + assert_array_equal(actual, expected) + + def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): + mask_X2 = _get_mask(X2, missing_values) + mask_X1 = _get_mask(X1, missing_values) + + expect_feat_missing = np.where(np.any(mask_X1, axis=0))[0] + + X1_in = retype(X1) + X2_in = retype(X2) + # features = "auto": + indicator = MissingIndicator(missing_values=missing_values, + sparse=sparse_param) + X1_tr = indicator.fit_transform(X1_in) + X2_tr = indicator.transform(X2_in) + features = indicator.features_ + assert_array_equal(expect_feat_missing, features) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, + missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, + missing_values) + if len(features) != 0: + assert_mask(X2_tr, mask_X2[:, features]) + assert_mask(X1_tr, mask_X1[:, features]) + else: + assert_mask(X2_tr, mask_X2) + assert_mask(X1_tr, mask_X1) + + # features = "all" + indicator = clone(indicator).set_params(features="all") + X1_tr = indicator.fit_transform(X1_in) + X2_tr = indicator.transform(X2_in) + features = np.arange(X2.shape[1]) + 
assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, + missing_values) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, + missing_values) + assert_mask(X2_tr, mask_X2[:, features]) + assert_mask(X1_tr, mask_X1[:, features]) + + features = [1, 2] + indicator = clone(indicator).set_params(features=features) + X1_tr = indicator.fit_transform(X1_in) + X2_tr = indicator.transform(X2_in) + assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, + missing_values) + assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, + missing_values) + assert_mask(X2_tr, mask_X2[:, features]) + assert_mask(X1_tr, mask_X1[:, features]) + + for X1, X2, missing_values in [(X1_orig, X2_orig, np.nan), + (X1_orig + 1, X2_orig + 1, 0)]: + for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, + sparse.csc_matrix, sparse.lil_matrix]: + for sparse_param in [True, False, 'auto']: + _check_missing_indicator(X1, X2, retype, sparse_param, + missing_values) + + +def test_missing_indicator_error(): + X1 = np.array([ + [-1, 1, 3], + [4, 0, -1], + [8, 1, 0] + ]) + X2 = np.array([ + [5, -1, -1], + [-1, 2, 3], + [2, 4, 0] + ]) + indicator = MissingIndicator(missing_values=-1) + indicator.fit(X1) + missing_features_fit = np.sum(X1 == -1, axis=0).nonzero()[0] + missing_features_tr = np.where(np.any(X2 == -1, axis=0))[0] + extra_missing_features = np.setdiff1d(missing_features_tr, + missing_features_fit) + err_msg = ("The features \{0} have missing values " + "in transform but have no missing values " + "in fit".format(extra_missing_features)) + assert_raises_regex(ValueError, err_msg, indicator.transform, X2) + + # features is incorrect keyword + features = "temp" + indicator = clone(indicator).set_params(features=features) + err_msg = ("Can only use these options: 'auto', 'all' got %s" % features) + assert_raises_regex(ValueError, err_msg, indicator.fit, X1) + + indicator = clone(indicator).set_params(features=[1.0, 2.0, 3.0]) + err_msg = ("Features 
should be an array of integers") + assert_raises_regex(ValueError, err_msg, indicator.fit, X1) + + sparse = "temp" + indicator = clone(indicator).set_params(features="auto", sparse=sparse) + err_msg = ("sparse can only boolean or 'auto' got {0}".format(sparse)) + assert_raises_regex(ValueError, err_msg, indicator.fit, X1) From 37f19a325451c2d3e75379f38b29f6871520bad0 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sun, 18 Feb 2018 11:16:04 +0530 Subject: [PATCH 25/49] fix flake8 comments --- sklearn/preprocessing/tests/test_imputation.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 4dad4ed856e78..f0923ae1dcbe7 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -2,16 +2,12 @@ import numpy as np from scipy import sparse -from sklearn.base import clone - from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import assert_no_warnings from sklearn.preprocessing.imputation import Imputer from sklearn.pipeline import Pipeline From 313a71bbd7eb74a87821bc429c4c0744565281b6 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sun, 18 Feb 2018 12:06:29 +0530 Subject: [PATCH 26/49] docstring changes --- sklearn/impute.py | 4 ++-- sklearn/tests/test_impute.py | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 81e2fbb046cdd..174651738a28c 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -349,8 +349,8 @@ class MissingIndicator(BaseEstimator, TransformerMixin): missing values in transform but 
have no missing values in fit This is applicable only when ``features`` = "auto" - Example - ------- + Examples + -------- >>> from sklearn.impute import MissingIndicator >>> import numpy as np >>> X1 = np.array([ diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 93e9ea891f7c7..0fee0509c60bf 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -263,16 +263,6 @@ def test_imputation_copy(): Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) - # copy=False, sparse csr, axis=1, missing_values=0 => copy - X = X_orig.copy() - imputer = SimpleImputer(missing_values=0, strategy="mean", - copy=False, axis=1) - Xt = imputer.fit(X).transform(X) - assert_false(sparse.issparse(Xt)) - - # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is - # made, even if copy=False. - def test_missing_indicator(): X1_orig = np.array([ From 8c956c7cc7fb86e40175d76e6f8d327dd386d7da Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Apr 2018 23:03:38 +0200 Subject: [PATCH 27/49] FIX add change in estimator checks --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c8841e7b11d1e..344b8ce6927ae 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -76,7 +76,7 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer', 'MissingValueIndicator' +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer', 'MissingIndicator', 'MinMaxScaler', 'QuantileTransformer'] From feddbcbd371c347b46e3e905f18b5ccae24df3c3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Apr 2018 23:13:20 +0200 Subject: [PATCH 28/49] FIX error during solving conflicts --- sklearn/impute.py | 4 ++++ sklearn/tests/test_impute.py | 5 +++-- 2 files changed, 7 insertions(+), 2 
deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 6bf48402ecb6c..08568cab911c3 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -807,6 +807,10 @@ def transform(self, X): Parameters ---------- X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- Xt : array-like, shape (n_samples, n_features) The imputed input data. """ diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 43e4cd0ba23af..ca732ac2e0890 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -273,6 +273,9 @@ def test_imputation_copy(): Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) + # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is + # made, even if copy=False. + def test_mice_rank_one(): rng = np.random.RandomState(0) @@ -624,5 +627,3 @@ def test_missing_indicator_error(): indicator = clone(indicator).set_params(features="auto", sparse=sparse) err_msg = ("sparse can only boolean or 'auto' got {0}".format(sparse)) assert_raises_regex(ValueError, err_msg, indicator.fit, X1) - # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is - # made, even if copy=False. From 49ef207f05b4d4ec36e554984dba6557f75383f9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 16:47:16 +0200 Subject: [PATCH 29/49] EHN address code reviews --- sklearn/impute.py | 243 ++++++++++++++++++----------------- sklearn/tests/test_impute.py | 29 ++--- 2 files changed, 134 insertions(+), 138 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 08568cab911c3..e3a4fe1a3e8e7 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -885,57 +885,55 @@ class MissingIndicator(BaseEstimator, TransformerMixin): The placeholder for the missing values. All occurrences of ``missing_values`` will be represented as ones. - features : {'auto' (default), 'all', array-like of int} - If "all", mask will represent all features. 
- If "auto", mask will only represent features with missing values - during fit time. - If mask/indices, mask will only represent features in the - indices or mask. - - sparse : boolean or "auto", optional (default="auto") - If True, the transformed ``X`` will be a sparse matrix. - If False, the transformed ``X`` will be a numpy array. - If "auto", the transformed ``X`` will be of same type as input. - - error_on_new : boolean, optional (default=True) - If True, transform will raise an error when there are features with - missing values in transform but have no missing values in fit - This is applicable only when ``features`` = "auto" + features : str, optional + Whether the imputer mask should represent all or a subset of + features. + + - If "missing-only" (default), the imputer mask will only represent + features containing missing values during fit time. + - If "all", the imputer mask will represent all features. + + sparse : boolean or "auto", optional + Whether the imputer mask format should be sparse or dense. + + - If "auto" (default), the imputer mask will be of same type as + input. + - If True, the imputer mask will be a sparse matrix. + - If False, the imputer mask will be a numpy array. + + error_on_new : boolean, optional + If True (default), transform will raise an error when there are + features with missing values in transform but have no missing values in + fit This is applicable only when ``features="missing-only"``. Examples -------- - >>> from sklearn.impute import MissingIndicator >>> import numpy as np - >>> X1 = np.array([ - ... [np.nan, 1, 3], - ... [ 4, 0, np.nan], - ... [ 8, 1, 0] - ... ]) - >>> X2 = np.array([ - ... [ 5, 1, np.nan], - ... [np.nan, 2, 3], - ... [ 2, 4, 0] - ... ]) + >>> from sklearn.impute import MissingIndicator + >>> X1 = np.array([[np.nan, 1, 3], + ... [4, 0, np.nan], + ... [8, 1, 0]]) + >>> X2 = np.array([[5, 1, np.nan], + ... [np.nan, 2, 3], + ... 
[2, 4, 0]]) >>> indicator = MissingIndicator() >>> indicator.fit(X1) - MissingIndicator(error_on_new=True, features='auto', missing_values='NaN', - sparse='auto') + MissingIndicator(error_on_new=True, features='missing-only', + missing_values='NaN', sparse='auto') >>> X2_tr = indicator.transform(X2) >>> X2_tr - array([[0, 1], - [1, 0], - [0, 0]]) + array([[0., 1.], + [1., 0.], + [0., 0.]]) Attributes ---------- - features_ : array of shape(n_missing_features,) - The features with missing values. + features_ : ndarray, shape (n_missing_features,) + The features containing missing values. - n_features_ : int - The number of features in the input. """ - def __init__(self, missing_values="NaN", features="auto", sparse="auto", + def __init__(self, missing_values="NaN", features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features @@ -956,59 +954,115 @@ def fit(self, X, y=None): self : object Returns self. """ - self._validate_params() - X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite=False) - self.n_features_ = X.shape[1] - - if isinstance(self.features, six.string_types): - if self.features == "auto": - _, self.features_ = self._get_missing_features_info(X) - else: # self.features == "all" - self.features_ = np.arange(self.n_features_) - else: - self.features_ = self.features + force_all_finite='allow-nan') + self._n_features = X.shape[1] + + if self.features not in ('missing-only', 'all'): + raise ValueError("'features' has to be either 'missing-only' or " + "'all'. Got {} instead.".format(self.features)) + + if not ((isinstance(self.sparse, six.string_types) and + self.sparse == "auto") or isinstance(self.sparse, bool)): + raise ValueError("'sparse' has to be a boolean or 'auto'. 
" + "Got {!r} instead.".format(self.sparse)) + + self.features_ = (self._get_missing_features_info(X)[1] + if self.features == 'missing-only' + else np.arange(self._n_features)) return self + def _get_missing_features_info(self, X): + """Compute the imputer mask and the indices of the features + containing missing values. + + Parameters + ---------- + X : {ndarray or sparse matrix}, shape (n_samples, n_features) + The input data with missing values. Note that ``X`` has been + checked in ``fit`` and ``transform`` before to call this function. + + Returns + ------- + imputer_mask : {ndarray or sparse matrix}, shape \ +(n_samples, n_features) or (n_samples, n_features_with_missing) + The imputer mask of the original data. + + features_with_missing : ndarray, shape (n_features_with_missing) + The features containing missing values. + + """ + if sparse.issparse(X) and self.missing_values != 0: + mask = _get_mask(X.data, self.missing_values) + + # The imputer mask will be constructed with the same sparse format + # as X. + sparse_constructor = (sparse.csr_matrix if X.format == 'csr' + else sparse.csc_matrix) + imputer_mask = sparse_constructor( + (mask, X.indices.copy(), X.indptr.copy()), + shape=X.shape, dtype=X.dtype) + + missing_values_mask = imputer_mask.copy() + missing_values_mask.eliminate_zeros() + features_with_missing = ( + np.flatnonzero(np.diff(missing_values_mask.indptr)) + if missing_values_mask.format == 'csc' + else np.unique(missing_values_mask.indices)) + + if self.sparse is False: + imputer_mask = imputer_mask.toarray() + elif imputer_mask.format == 'csr': + imputer_mask = imputer_mask.tocsc() + else: + if sparse.issparse(X): + # case of sparse matrix with 0 as missing values. Implicit and + # explicit zeros are considered as missing values. 
+ X = X.toarray() + imputer_mask = _get_mask(X, self.missing_values).astype(X.dtype, + copy=False) + features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0)) + + if self.sparse is True: + imputer_mask = sparse.csc_matrix(imputer_mask) + + return imputer_mask, features_with_missing + def transform(self, X): """Generate missing values indicator for X. Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. Returns ------- - Xt : array or sparse matrix, shape = [n_samples, n_features] - The missing indicator for input data + Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) + The missing indicator for input data. """ - check_is_fitted(self, "features_", "n_features_") + check_is_fitted(self, "features_") X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite=False) - if X.shape[1] != self.n_features_: + force_all_finite='allow-nan') + if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " "than during fitting.") - imputer_mask, features_ = self._get_missing_features_info(X) + imputer_mask, features = self._get_missing_features_info(X) - if isinstance(self.features, six.string_types): - if self.features == "auto": - features = np.setdiff1d(features_, - self.features_) - if self.error_on_new and features.size > 0: - raise ValueError("The features %s have missing values " - "in transform but have no missing values " - "in fit" % features) + if self.features == "missing-only": + features_diff_fit_trans = np.setdiff1d(features, self.features_) + if (self.error_on_new and features_diff_fit_trans.size > 0): + raise ValueError("The features {} have missing values " + "in transform but have no missing values " + "in fit.".format(features_diff_fit_trans)) - if not (isinstance(self.features, six.string_types) and - self.features == "all") and len(self.features_) != 0: - # no need to 
slice when all features have missing values - imputer_mask = imputer_mask[:, self.features_] + if (self.features_.size > 0 and + self.features_.size < self._n_features): + imputer_mask = imputer_mask[:, self.features_] return imputer_mask @@ -1017,58 +1071,13 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. Returns ------- - Xt : array or sparse matrix, shape = [n_samples, n_features] - The missing indicator for input data + Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) + The missing indicator for input data. """ - self.fit(X, y) - return self.transform(X) - - def _validate_params(self): - if (isinstance(self.features, six.string_types) and - self.features not in ["auto", "all"]): - raise ValueError("Can only use these options: 'auto', 'all'" - " got {0}".format(self.features)) - elif not isinstance(self.features, six.string_types): - self.features = check_array(self.features, ensure_2d=False) - if (isinstance(self.features, np.ndarray) and - self.features.dtype.kind != 'i'): - raise ValueError("Features should be an array of integers") - - if not ((isinstance(self.sparse, six.string_types) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("sparse can only boolean or 'auto'" - " got {0}".format(self.sparse)) - - def _get_missing_features_info(self, X): - if sparse.issparse(X) and self.missing_values != 0: - # sparse matrix and missing values is not zero - imputer_mask = _get_mask(X.data, self.missing_values) - imputer_mask = X.__class__((imputer_mask, X.indices.copy(), - X.indptr.copy()), shape=X.shape, - dtype=X.dtype) - feat_with_missing = imputer_mask.sum(axis=0).nonzero()[1] - else: - # sparse with zero as missing value and dense matrix - if sparse.issparse(X): - X = X.toarray() - imputer_mask = _get_mask(X, self.missing_values) - # convert 
boolean mask to binary mask - imputer_mask = imputer_mask.astype(int, copy=False) - feat_with_missing = imputer_mask.sum(axis=0).nonzero()[0] - - if ((self.sparse == 'auto' and sparse.issparse(imputer_mask)) or - self.sparse is True): - imputer_mask = sparse.csc_matrix(imputer_mask) - elif self.sparse is False and sparse.issparse(imputer_mask): - imputer_mask = imputer_mask.toarray() - - if isinstance(feat_with_missing, np.matrix): - feat_with_missing = feat_with_missing.A1 - - return imputer_mask, feat_with_missing + return self.fit(X, y).transform(X) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index ca732ac2e0890..f392340510ed8 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raises_regex from sklearn.impute import _get_mask @@ -541,7 +540,7 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): X1_in = retype(X1) X2_in = retype(X2) - # features = "auto": + # features = "missing-only": indicator = MissingIndicator(missing_values=missing_values, sparse=sparse_param) X1_tr = indicator.fit_transform(X1_in) @@ -571,17 +570,6 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): assert_mask(X2_tr, mask_X2[:, features]) assert_mask(X1_tr, mask_X1[:, features]) - features = [1, 2] - indicator = clone(indicator).set_params(features=features) - X1_tr = indicator.fit_transform(X1_in) - X2_tr = indicator.transform(X2_in) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, - missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, - missing_values) - assert_mask(X2_tr, mask_X2[:, features]) - assert_mask(X1_tr, mask_X1[:, 
features]) - for X1, X2, missing_values in [(X1_orig, X2_orig, np.nan), (X1_orig + 1, X2_orig + 1, 0)]: for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, @@ -610,20 +598,19 @@ def test_missing_indicator_error(): missing_features_fit) err_msg = ("The features \{0} have missing values " "in transform but have no missing values " - "in fit".format(extra_missing_features)) + "in fit.".format(extra_missing_features)) assert_raises_regex(ValueError, err_msg, indicator.transform, X2) # features is incorrect keyword features = "temp" indicator = clone(indicator).set_params(features=features) - err_msg = ("Can only use these options: 'auto', 'all' got %s" % features) - assert_raises_regex(ValueError, err_msg, indicator.fit, X1) - - indicator = clone(indicator).set_params(features=[1.0, 2.0, 3.0]) - err_msg = ("Features should be an array of integers") + err_msg = ("'features' has to be either 'missing-only' or 'all'. " + "Got %s instead" % features) assert_raises_regex(ValueError, err_msg, indicator.fit, X1) sparse = "temp" - indicator = clone(indicator).set_params(features="auto", sparse=sparse) - err_msg = ("sparse can only boolean or 'auto' got {0}".format(sparse)) + indicator = clone(indicator).set_params(features="missing-only", + sparse=sparse) + err_msg = ("'sparse' has to be a boolean or 'auto'. 
Got {!r} instead" + .format(sparse)) assert_raises_regex(ValueError, err_msg, indicator.fit, X1) From 712b2f48d99cf45799af4cbdf06cd0b3f6513d2f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 17:53:15 +0200 Subject: [PATCH 30/49] PEP8 --- sklearn/impute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index e3a4fe1a3e8e7..fa8e1a38a7457 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -933,8 +933,8 @@ class MissingIndicator(BaseEstimator, TransformerMixin): """ - def __init__(self, missing_values="NaN", features="missing-only", sparse="auto", - error_on_new=True): + def __init__(self, missing_values="NaN", features="missing-only", + sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features self.sparse = sparse From 5c495e1ac00f48623b72a5853d59c25bb0ff4007 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 18:14:41 +0200 Subject: [PATCH 31/49] DOC address comments documentation --- doc/modules/impute.rst | 56 +++++++++++-------- sklearn/impute.py | 12 ++-- .../preprocessing/tests/test_imputation.py | 2 +- 3 files changed, 41 insertions(+), 29 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 63662da433cc9..1d0921fa16d7d 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -100,27 +100,37 @@ dataset. This transformation is useful in conjunction with imputation. When using imputation, preserving the information about which values had been missing can be informative. -The ``features`` parameter is used to choose the features for which the mask is -constructed. By default, it is 'auto' which means the binary matrix has -features with missing values during fit time. When it is 'all' the matrix has -all the features. +``NaN`` is usually used as the placeholder for missing values. However, it +enforces the data type to be float. 
The parameter ``missing_values`` allows to +specify other placeholder such as integer. In the following example, we will +use ``-1`` as missing values:: + + >>> from sklearn.impute import MissingIndicator + >>> X = np.array([[-1, -1, 1, 3], + ... [4, -1, 0, -1], + ... [8, -1, 1, 0]]) + >>> indicator = MissingIndicator(missing_values=-1) + >>> mask_missing_values_only = indicator.fit_transform(X) + >>> mask_missing_values_only + array([[1, 1, 0], + [0, 1, 1], + [0, 1, 0]]) - >>> from sklearn.impute import MissingIndicator - >>> import numpy as np - >>> X1 = np.array([ - ... [-1, -1, 1, 3], - ... [ 4, -1, 0, -1], - ... [ 8, -1, 1, 0], - ... ]) - >>> indicator = MissingIndicator(missing_values=-1) - >>> X1_tr = indicator.fit_transform(X1) - >>> X1_tr - array([[1, 1, 0], - [0, 1, 1], - [0, 1, 0]]) - >>> indicator = MissingIndicator(missing_values=-1, features="all") - >>> X1_tr = indicator.fit_transform(X1) - >>> X1_tr - array([[1, 1, 0, 0], - [0, 1, 0, 1], - [0, 1, 0, 0]]) +The ``features`` parameter is used to choose the features for which the mask is +constructed. By default, it is ``'missing-only'`` which returns the imputer +mask of the features containing missing values at ``fit`` time:: + + >>> indicator.features_ + array([0, 1, 3]) + +The ``features`` parameter can be set to ``'all'`` to returned all features +whether or not they contain missing values:: + + >>> indicator = MissingIndicator(missing_values=-1, features="all") + >>> mask_all = indicator.fit_transform(X) + >>> mask_all + array([[1, 1, 0, 0], + [0, 1, 0, 1], + [0, 1, 0, 0]]) + >>> indicator.features_ + array([0, 1, 2, 3]) diff --git a/sklearn/impute.py b/sklearn/impute.py index fa8e1a38a7457..a63e42b3c0862 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -906,6 +906,13 @@ class MissingIndicator(BaseEstimator, TransformerMixin): features with missing values in transform but have no missing values in fit This is applicable only when ``features="missing-only"``. 
+ Attributes + ---------- + features_ : ndarray, shape (n_missing_features,) or (n_features,) + The features indices which will be returned when calling ``transform``. + They are computed during ``fit``. For ``features='all'``, it is + to ``range(n_features)``. + Examples -------- >>> import numpy as np @@ -926,11 +933,6 @@ class MissingIndicator(BaseEstimator, TransformerMixin): [1., 0.], [0., 0.]]) - Attributes - ---------- - features_ : ndarray, shape (n_missing_features,) - The features containing missing values. - """ def __init__(self, missing_values="NaN", features="missing-only", diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index f0923ae1dcbe7..663262b50289b 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -8,8 +8,8 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings -from sklearn.preprocessing.imputation import Imputer +from sklearn.preprocessing.imputation import Imputer from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree From 1efcd828d9cf711e9eed8bf2defa671889841648 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 18:29:47 +0200 Subject: [PATCH 32/49] TST parametrize error test --- sklearn/tests/test_impute.py | 50 ++++++++++++------------------------ 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f392340510ed8..d1492feffdcfd 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -579,38 +579,20 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): missing_values) -def test_missing_indicator_error(): - X1 = np.array([ - [-1, 1, 3], - [4, 0, -1], - [8, 1, 0] - ]) - X2 = np.array([ - [5, -1, -1], - [-1, 2, 3], - [2, 4, 0] 
- ]) +@pytest.mark.parametrize( + "X_fit, X_trans, params, msg_err", + [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), + {'features': 'missing-only', 'sparse': 'auto'}, + 'have missing values in transform but have no missing values in fit'), + (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), + {'features': 'random', 'sparse': 'auto'}, + "'features' has to be either 'missing-only' or 'all'"), + (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), + {'features': 'all', 'sparse': 'random'}, + "'sparse' has to be a boolean or 'auto'")] +) +def test_missing_indicator_error_new(X_fit, X_trans, params, msg_err): indicator = MissingIndicator(missing_values=-1) - indicator.fit(X1) - missing_features_fit = np.sum(X1 == -1, axis=0).nonzero()[0] - missing_features_tr = np.where(np.any(X2 == -1, axis=0))[0] - extra_missing_features = np.setdiff1d(missing_features_tr, - missing_features_fit) - err_msg = ("The features \{0} have missing values " - "in transform but have no missing values " - "in fit.".format(extra_missing_features)) - assert_raises_regex(ValueError, err_msg, indicator.transform, X2) - - # features is incorrect keyword - features = "temp" - indicator = clone(indicator).set_params(features=features) - err_msg = ("'features' has to be either 'missing-only' or 'all'. " - "Got %s instead" % features) - assert_raises_regex(ValueError, err_msg, indicator.fit, X1) - - sparse = "temp" - indicator = clone(indicator).set_params(features="missing-only", - sparse=sparse) - err_msg = ("'sparse' has to be a boolean or 'auto'. 
Got {!r} instead" - .format(sparse)) - assert_raises_regex(ValueError, err_msg, indicator.fit, X1) + indicator.set_params(**params) + with pytest.raises(ValueError, match=msg_err): + indicator.fit(X_fit).transform(X_trans) From 50bc29c5c185b7beb8c710ea714568130ed3ae2c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 18:35:43 +0200 Subject: [PATCH 33/49] reverse useless change --- sklearn/preprocessing/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 91f09d962a430..ba0884613c124 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -34,6 +34,7 @@ from .imputation import Imputer + __all__ = [ 'Binarizer', 'FunctionTransformer', From e3abbc60693b6a312d2fe5850170a1fa92bec4fe Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 19:29:55 +0200 Subject: [PATCH 34/49] PEP8 --- sklearn/tests/test_impute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index d1492feffdcfd..6760301dad5e9 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -10,7 +10,6 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_false -from sklearn.utils.testing import assert_raises_regex from sklearn.impute import _get_mask from sklearn.impute import SimpleImputer From 492967ca30c8a1f295e32225724cd33fd974234f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 23:21:20 +0200 Subject: [PATCH 35/49] TST parametrize test and split tests --- sklearn/impute.py | 66 ++++++------- sklearn/tests/test_impute.py | 180 ++++++++++++++++++----------------- 2 files changed, 128 insertions(+), 118 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index a63e42b3c0862..9200162e16316 100644 --- a/sklearn/impute.py +++ 
b/sklearn/impute.py @@ -942,39 +942,6 @@ def __init__(self, missing_values="NaN", features="missing-only", self.sparse = sparse self.error_on_new = error_on_new - def fit(self, X, y=None): - """Fit the transformer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite='allow-nan') - self._n_features = X.shape[1] - - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) - - if not ((isinstance(self.sparse, six.string_types) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("'sparse' has to be a boolean or 'auto'. " - "Got {!r} instead.".format(self.sparse)) - - self.features_ = (self._get_missing_features_info(X)[1] - if self.features == 'missing-only' - else np.arange(self._n_features)) - - return self - def _get_missing_features_info(self, X): """Compute the imputer mask and the indices of the features containing missing values. @@ -1031,6 +998,39 @@ def _get_missing_features_info(self, X): return imputer_mask, features_with_missing + def fit(self, X, y=None): + """Fit the transformer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + X = check_array(X, accept_sparse=('csc', 'csr'), + force_all_finite='allow-nan') + self._n_features = X.shape[1] + + if self.features not in ('missing-only', 'all'): + raise ValueError("'features' has to be either 'missing-only' or " + "'all'. 
Got {} instead.".format(self.features)) + + if not ((isinstance(self.sparse, six.string_types) and + self.sparse == "auto") or isinstance(self.sparse, bool)): + raise ValueError("'sparse' has to be a boolean or 'auto'. " + "Got {!r} instead.".format(self.sparse)) + + self.features_ = (self._get_missing_features_info(X)[1] + if self.features == 'missing-only' + else np.arange(self._n_features)) + + return self + def transform(self, X): """Generate missing values indicator for X. diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6760301dad5e9..7907610d2b6a9 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -5,12 +5,10 @@ import numpy as np from scipy import sparse -from sklearn.base import clone from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_false -from sklearn.impute import _get_mask from sklearn.impute import SimpleImputer from sklearn.impute import MICEImputer @@ -496,88 +494,6 @@ def test_mice_additive_matrix(): assert_allclose(X_test_filled, X_test_est, atol=0.01) -def test_missing_indicator(): - X1_orig = np.array([ - [np.nan, np.nan, 1, 3], - [4, np.nan, 0, np.nan], - [8, np.nan, 1, 0], - [0, np.nan, 0, 15], - [16, np.nan, 1, 19] - ]) - X2_orig = np.array([ - [5, 1, 1, np.nan], - [np.nan, np.nan, 2, 3], - [2, 3, 4, 0], - [0, np.nan, 5, np.nan], - [11, np.nan, 1, 1] - ]) - - def assert_type(actual, X_is_sparse, sparse_param, missing_values): - if sparse_param is True: - assert actual == sparse.csc_matrix - elif (sparse_param == "auto" and missing_values == 0): - assert actual == np.ndarray - elif sparse_param is False: - assert actual == np.ndarray - else: - if X_is_sparse: - assert actual == sparse.csc_matrix - else: - assert actual == np.ndarray - - def assert_mask(actual, expected): - if hasattr(actual, 'toarray'): - 
assert_array_equal(actual.toarray(), expected) - else: - assert_array_equal(actual, expected) - - def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): - mask_X2 = _get_mask(X2, missing_values) - mask_X1 = _get_mask(X1, missing_values) - - expect_feat_missing = np.where(np.any(mask_X1, axis=0))[0] - - X1_in = retype(X1) - X2_in = retype(X2) - # features = "missing-only": - indicator = MissingIndicator(missing_values=missing_values, - sparse=sparse_param) - X1_tr = indicator.fit_transform(X1_in) - X2_tr = indicator.transform(X2_in) - features = indicator.features_ - assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, - missing_values) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, - missing_values) - if len(features) != 0: - assert_mask(X2_tr, mask_X2[:, features]) - assert_mask(X1_tr, mask_X1[:, features]) - else: - assert_mask(X2_tr, mask_X2) - assert_mask(X1_tr, mask_X1) - - # features = "all" - indicator = clone(indicator).set_params(features="all") - X1_tr = indicator.fit_transform(X1_in) - X2_tr = indicator.transform(X2_in) - features = np.arange(X2.shape[1]) - assert_type(type(X1_tr), sparse.issparse(X1_in), sparse_param, - missing_values) - assert_type(type(X2_tr), sparse.issparse(X2_in), sparse_param, - missing_values) - assert_mask(X2_tr, mask_X2[:, features]) - assert_mask(X1_tr, mask_X1[:, features]) - - for X1, X2, missing_values in [(X1_orig, X2_orig, np.nan), - (X1_orig + 1, X2_orig + 1, 0)]: - for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, - sparse.csc_matrix, sparse.lil_matrix]: - for sparse_param in [True, False, 'auto']: - _check_missing_indicator(X1, X2, retype, sparse_param, - missing_values) - - @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -590,8 +506,102 @@ def _check_missing_indicator(X1, X2, retype, sparse_param, missing_values): {'features': 
'all', 'sparse': 'random'}, "'sparse' has to be a boolean or 'auto'")] ) -def test_missing_indicator_error_new(X_fit, X_trans, params, msg_err): +def test_missing_indicator_error(X_fit, X_trans, params, msg_err): indicator = MissingIndicator(missing_values=-1) indicator.set_params(**params) with pytest.raises(ValueError, match=msg_err): indicator.fit(X_fit).transform(X_trans) + + +@pytest.mark.parametrize( + "missing_values, dtype", + [(np.nan, np.float64), + (0, np.int32), + (-1, np.int32)]) +@pytest.mark.parametrize( + "arr_type", + [np.array, sparse.csc_matrix, sparse.csr_matrix, sparse.coo_matrix, + sparse.lil_matrix, sparse.bsr_matrix]) +@pytest.mark.parametrize( + "param_features, n_features, features_indices", + [('missing-only', 2, np.array([0, 1])), + ('all', 3, np.array([0, 1, 2]))]) +def test_missing_indicator_new(missing_values, arr_type, dtype, param_features, + n_features, features_indices): + X_fit = np.array([[missing_values, missing_values, 1], + [4, missing_values, 2]]) + X_trans = np.array([[missing_values, missing_values, 1], + [4, 12, 10]]) + X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]]) + X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]]) + + # convert the input to the right array format and right dtype + X_fit = arr_type(X_fit).astype(dtype) + X_trans = arr_type(X_trans).astype(dtype) + X_fit_expected = X_fit_expected.astype(dtype) + X_trans_expected = X_trans_expected.astype(dtype) + + indicator = MissingIndicator(missing_values=missing_values, + features=param_features, + sparse=False) + X_fit_mask = indicator.fit_transform(X_fit) + X_trans_mask = indicator.transform(X_trans) + + assert X_fit_mask.shape[1] == n_features + assert X_trans_mask.shape[1] == n_features + + assert_array_equal(indicator.features_, features_indices) + assert_allclose(X_fit_mask, X_fit_expected[:, features_indices]) + assert_allclose(X_trans_mask, X_trans_expected[:, features_indices]) + + assert X_fit_mask.dtype == dtype + assert X_trans_mask.dtype == dtype + 
assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + + indicator.set_params(sparse=True) + X_fit_mask_sparse = indicator.fit_transform(X_fit) + X_trans_mask_sparse = indicator.transform(X_trans) + + assert X_fit_mask_sparse.format == 'csc' + assert X_trans_mask_sparse.format == 'csc' + assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask) + assert_allclose(X_trans_mask_sparse.toarray(), X_trans_mask) + + +@pytest.mark.parametrize("param_sparse", [True, False, 'auto']) +@pytest.mark.parametrize("missing_values", [np.nan, 0]) +@pytest.mark.parametrize( + "arr_type", + [np.array, sparse.csc_matrix, sparse.csr_matrix, sparse.coo_matrix]) +def test_missing_indicator_sparse_param(arr_type, missing_values, + param_sparse): + # check the format of the output with different sparse parameter + X_fit = np.array([[missing_values, missing_values, 1], + [4, missing_values, 2]]) + X_trans = np.array([[missing_values, missing_values, 1], + [4, 12, 10]]) + X_fit = arr_type(X_fit).astype(np.float64) + X_trans = arr_type(X_trans).astype(np.float64) + + indicator = MissingIndicator(missing_values=missing_values, + sparse=param_sparse) + X_fit_mask = indicator.fit_transform(X_fit) + X_trans_mask = indicator.transform(X_trans) + + if param_sparse is True: + assert X_fit_mask.format == 'csc' + assert X_trans_mask.fomat == 'csc' + elif param_sparse == 'auto' and missing_values == 0: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + elif param_sparse is False: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + else: + if sparse.isparse(X_fit): + assert X_fit_mask.format == 'csc' + assert X_trans_mask.fomat == 'csc' + else: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) From 7df0d14160f569c1050c7230e59b96f0abc84408 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Apr 2018 23:39:45 +0200 Subject: [PATCH 36/49] FIX 
typo in tests --- doc/modules/classes.rst | 3 ++- doc/whats_new/v0.20.rst | 4 ++++ sklearn/tests/test_impute.py | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 243c63ab0c7e2..fea5e6eb2113e 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -645,8 +645,9 @@ Kernels: :toctree: generated/ :template: class.rst - impute.SimpleImputer impute.MICEImputer + impute.MissingIndicator + impute.SimpleImputer .. _kernel_approximation_ref: diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 29648d8a5ed93..43b158380d143 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -97,6 +97,10 @@ Preprocessing - Updated :class:`preprocessing.MinMaxScaler` to pass through NaN values. :issue:`10404` by :user:`Lucija Gregov `. +- Added :class:`MissingIndicator` which generates a binary indicator for + missing values. :issue:`8075` by :user:`Maniteja Nandana ` and + :user:`Guillaume Lemaitre `.
+ Model evaluation - Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 7907610d2b6a9..14eafc5e3e119 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -591,7 +591,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, if param_sparse is True: assert X_fit_mask.format == 'csc' - assert X_trans_mask.fomat == 'csc' + assert X_trans_mask.format == 'csc' elif param_sparse == 'auto' and missing_values == 0: assert isinstance(X_fit_mask, np.ndarray) assert isinstance(X_trans_mask, np.ndarray) @@ -599,9 +599,9 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, assert isinstance(X_fit_mask, np.ndarray) assert isinstance(X_trans_mask, np.ndarray) else: - if sparse.isparse(X_fit): + if sparse.issparse(X_fit): assert X_fit_mask.format == 'csc' - assert X_trans_mask.fomat == 'csc' + assert X_trans_mask.format == 'csc' else: assert isinstance(X_fit_mask, np.ndarray) assert isinstance(X_trans_mask, np.ndarray) From 12103adf8fa807ae0e7e128cf9dd51eade4f3999 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 28 Apr 2018 15:08:34 +0200 Subject: [PATCH 37/49] FIX change default type to bool --- doc/modules/impute.rst | 12 ++++++------ sklearn/impute.py | 11 +++++------ sklearn/tests/test_impute.py | 6 ++++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1d0921fa16d7d..d4414cac014ea 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -112,9 +112,9 @@ use ``-1`` as missing values:: >>> indicator = MissingIndicator(missing_values=-1) >>> mask_missing_values_only = indicator.fit_transform(X) >>> mask_missing_values_only - array([[1, 1, 0], - [0, 1, 1], - [0, 1, 0]]) + array([[ True, True, False], + [False, True, True], + [False, True, False]]) The ``features`` parameter is used to choose the features for which the mask is 
constructed. By default, it is ``'missing-only'`` which returns the imputer @@ -129,8 +129,8 @@ whether or not they contain missing values:: >>> indicator = MissingIndicator(missing_values=-1, features="all") >>> mask_all = indicator.fit_transform(X) >>> mask_all - array([[1, 1, 0, 0], - [0, 1, 0, 1], - [0, 1, 0, 0]]) + array([[ True, True, False, False], + [False, True, False, True], + [False, True, False, False]]) >>> indicator.features_ array([0, 1, 2, 3]) diff --git a/sklearn/impute.py b/sklearn/impute.py index 9200162e16316..b7347720d4de7 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -929,9 +929,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): missing_values='NaN', sparse='auto') >>> X2_tr = indicator.transform(X2) >>> X2_tr - array([[0., 1.], - [1., 0.], - [0., 0.]]) + array([[False, True], + [ True, False], + [False, False]]) """ @@ -971,7 +971,7 @@ def _get_missing_features_info(self, X): else sparse.csc_matrix) imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), - shape=X.shape, dtype=X.dtype) + shape=X.shape, dtype=bool) missing_values_mask = imputer_mask.copy() missing_values_mask.eliminate_zeros() @@ -989,8 +989,7 @@ def _get_missing_features_info(self, X): # case of sparse matrix with 0 as missing values. Implicit and # explicit zeros are considered as missing values. 
X = X.toarray() - imputer_mask = _get_mask(X, self.missing_values).astype(X.dtype, - copy=False) + imputer_mask = _get_mask(X, self.missing_values) features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0)) if self.sparse is True: diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 14eafc5e3e119..6fc36dc70418f 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -554,8 +554,8 @@ def test_missing_indicator_new(missing_values, arr_type, dtype, param_features, assert_allclose(X_fit_mask, X_fit_expected[:, features_indices]) assert_allclose(X_trans_mask, X_trans_expected[:, features_indices]) - assert X_fit_mask.dtype == dtype - assert X_trans_mask.dtype == dtype + assert X_fit_mask.dtype == bool + assert X_trans_mask.dtype == bool assert isinstance(X_fit_mask, np.ndarray) assert isinstance(X_trans_mask, np.ndarray) @@ -563,6 +563,8 @@ def test_missing_indicator_new(missing_values, arr_type, dtype, param_features, X_fit_mask_sparse = indicator.fit_transform(X_fit) X_trans_mask_sparse = indicator.transform(X_trans) + assert X_fit_mask_sparse.dtype == bool + assert X_trans_mask_sparse.dtype == bool assert X_fit_mask_sparse.format == 'csc' assert X_trans_mask_sparse.format == 'csc' assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask) From 007c6e31984bfd2537c7af34366fad6d57702104 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 28 Apr 2018 15:10:28 +0200 Subject: [PATCH 38/49] EHN add a not regarding the default dtype --- sklearn/impute.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index b7347720d4de7..0b596f90b898b 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1041,7 +1041,8 @@ def transform(self, X): Returns ------- Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. + The missing indicator for input data. The data type of ``Xt`` + will be boolean. 
""" check_is_fitted(self, "features_") @@ -1078,7 +1079,8 @@ def fit_transform(self, X, y=None): Returns ------- Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. + The missing indicator for input data. The data type of ``Xt`` + will be boolean. """ return self.fit(X, y).transform(X) From 74679e65dbfb0b4d71afc456f9e7bc3b6d90511c Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 26 Jun 2018 14:19:35 +1000 Subject: [PATCH 39/49] Insert missing comma --- sklearn/impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index d30e72c03ce42..6a476752c8a57 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -36,7 +36,7 @@ __all__ = [ 'MissingIndicator', - 'SimpleImputer' + 'SimpleImputer', 'ChainedImputer', ] From 754c4e30e44e0cf561f674b78c3c99ab3108653e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 12:04:58 +0200 Subject: [PATCH 40/49] update --- sklearn/impute.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 6a476752c8a57..8d9658b21a421 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -973,9 +973,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : integer or np.NaN, optional (default=np.NaN) + missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - ``missing_values`` will be represented as ones. + `missing_values` will be imputed. 
features : str, optional Whether the imputer mask should represent all or a subset of @@ -1027,7 +1027,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): """ - def __init__(self, missing_values="NaN", features="missing-only", + def __init__(self, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features From b895f7c3c460457c074a0dba6cba732b4e1ec992 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 15:49:14 +0200 Subject: [PATCH 41/49] FIX raise error with inconsistent dtype X and missing_values --- sklearn/impute.py | 14 +++++++++++++- sklearn/tests/test_impute.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 836bfb0167add..1c30442d5952a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -40,6 +40,17 @@ ] +def _check_inputs_dtype(X, missing_values): + """Check that the dtype of X is in accordance with the one of + missing_values.""" + if (X.dtype.kind in ("f", "i", "u") and + not isinstance(missing_values, numbers.Real)): + raise TypeError("The data type of 'missing_values' and 'X' are " + "not compatible. 'missing_values' data type is " + "{} and 'X' is {}." + .format(type(missing_values), X.dtype)) + + def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: @@ -51,7 +62,6 @@ def _get_mask(X, value_to_mask): else: # np.isnan does not work on object dtypes. return _object_dtype_isnan(X) - else: # X == value_to_mask with object dytpes does not always perform # element-wise for old versions of numpy @@ -183,6 +193,7 @@ def _validate_input(self, X): else: raise ve + _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("SimpleImputer does not support data with dtype " "{0}. 
Please provide either a numeric array (with" @@ -788,6 +799,7 @@ def _initial_imputation(self, X): X = check_array(X, dtype=FLOAT_DTYPES, order="F", force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) if self.initial_imputer_ is None: diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f5c42f7443487..672c87b8d2fdf 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -705,3 +705,23 @@ def test_chained_imputer_additive_matrix(): random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) assert_allclose(X_test_filled, X_test_est, atol=0.01) + + +@pytest.mark.parametrize("imputer_constructor", + [SimpleImputer, ChainedImputer]) +@pytest.mark.parametrize( + "missing_values, X_missing_value, err_type, err_msg", + [("NaN", np.nan, ValueError, "contains"), + ("-1", -1, TypeError, "not compatible")]) +def test_inconsistent_dtype_X_missing_values(imputer_constructor, + missing_values, X_missing_value, + err_type, err_msg): + # regression test for issue #11390. Comparison between incoherent dtype + # for X and missing_values was not raising a proper error. + X = np.random.randn(1000, 10) + X[0, 0] = X_missing_value + + imputer = imputer_constructor(missing_values=missing_values) + + with pytest.raises(err_type, match=err_msg): + imputer.fit_transform(X) From 44dbc91ca6067bf6bb8dc43c9feeae21bfd3ae7a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 16:01:18 +0200 Subject: [PATCH 42/49] solve issue with NaN as string --- sklearn/impute.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 4aea65e6fe1b9..1f3a6e131a965 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1115,8 +1115,14 @@ def fit(self, X, y=None): self : object Returns self. 
""" + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite='allow-nan') + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): @@ -1151,8 +1157,14 @@ def transform(self, X): """ check_is_fitted(self, "features_") + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite='allow-nan') + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " "than during fitting.") From 34fb9a3de5495dd56068605ed28f72e5fdf1aaea Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 17:09:09 +0200 Subject: [PATCH 43/49] address jeremy comments --- sklearn/impute.py | 8 ++++---- sklearn/tests/test_impute.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 1c30442d5952a..999689805b65d 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -45,10 +45,10 @@ def _check_inputs_dtype(X, missing_values): missing_values.""" if (X.dtype.kind in ("f", "i", "u") and not isinstance(missing_values, numbers.Real)): - raise TypeError("The data type of 'missing_values' and 'X' are " - "not compatible. 'missing_values' data type is " - "{} and 'X' is {}." - .format(type(missing_values), X.dtype)) + raise ValueError("The data type of 'missing_values' and 'X' are " + "not compatible. 'missing_values' data type is " + "{} and 'X' is {}." 
+ .format(type(missing_values), X.dtype)) def _get_mask(X, value_to_mask): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 672c87b8d2fdf..906507cba93bd 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -710,18 +710,18 @@ def test_chained_imputer_additive_matrix(): @pytest.mark.parametrize("imputer_constructor", [SimpleImputer, ChainedImputer]) @pytest.mark.parametrize( - "missing_values, X_missing_value, err_type, err_msg", - [("NaN", np.nan, ValueError, "contains"), - ("-1", -1, TypeError, "not compatible")]) + "missing_values, X_missing_value, err_msg", + [("NaN", np.nan, "contains"), + ("-1", -1, "not compatible")]) def test_inconsistent_dtype_X_missing_values(imputer_constructor, missing_values, X_missing_value, - err_type, err_msg): + err_msg): # regression test for issue #11390. Comparison between incoherent dtype # for X and missing_values was not raising a proper error. - X = np.random.randn(1000, 10) + X = np.random.randn(10, 10) X[0, 0] = X_missing_value imputer = imputer_constructor(missing_values=missing_values) - with pytest.raises(err_type, match=err_msg): + with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) From 3abc6951fc8e47a466023d7023771ed80d822c45 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 19:04:22 +0200 Subject: [PATCH 44/49] address andy comments --- sklearn/tests/test_impute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 906507cba93bd..4b48a768b3ba9 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -711,8 +711,9 @@ def test_chained_imputer_additive_matrix(): [SimpleImputer, ChainedImputer]) @pytest.mark.parametrize( "missing_values, X_missing_value, err_msg", - [("NaN", np.nan, "contains"), - ("-1", -1, "not compatible")]) + [("NaN", np.nan, "Input contains NaN"), + ("-1", -1, "The data type of 'missing_values' 
and 'X' are not compatible") + ]) def test_inconsistent_dtype_X_missing_values(imputer_constructor, missing_values, X_missing_value, err_msg): From 05226fd69017c9cf508a8b4ac4573ba08530f23c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 19:35:40 +0200 Subject: [PATCH 45/49] PEP8 --- sklearn/tests/test_impute.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 4b48a768b3ba9..c4145f76d972e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -712,8 +712,7 @@ def test_chained_imputer_additive_matrix(): @pytest.mark.parametrize( "missing_values, X_missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), - ("-1", -1, "The data type of 'missing_values' and 'X' are not compatible") - ]) + ("-1", -1, "data type of 'missing_values' and 'X' are not compatible")]) def test_inconsistent_dtype_X_missing_values(imputer_constructor, missing_values, X_missing_value, err_msg): From 7695551d6e507985b96a3a787a02465f2c5d1969 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jun 2018 22:59:55 +0200 Subject: [PATCH 46/49] DOC fix doc parameter --- sklearn/impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 1f3a6e131a965..d3eb54c1a98a7 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1030,7 +1030,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): >>> indicator = MissingIndicator() >>> indicator.fit(X1) MissingIndicator(error_on_new=True, features='missing-only', - missing_values='NaN', sparse='auto') + missing_values=nan, sparse='auto') >>> X2_tr = indicator.transform(X2) >>> X2_tr array([[False, True], From d4ca8a823a046cf382b50f6b35137e980bede0d4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Jul 2018 22:41:39 +0200 Subject: [PATCH 47/49] EXA show an example using MissingIndicator --- examples/plot_missing_values.py | 37 
+++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index d238a16592edb..10c46733188d5 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -4,15 +4,19 @@ ==================================================== Missing values can be replaced by the mean, the median or the most frequent -value using the basic ``SimpleImputer``. +value using the basic :func:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). -Another option is the ``ChainedImputer``. This uses round-robin linear -regression, treating every variable as an output in turn. The version -implemented assumes Gaussian (output) variables. If your features are obviously -non-Normal, consider transforming them to look more Normal so as to improve -performance. +Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +round-robin linear regression, treating every variable as an output in +turn. The version implemented assumes Gaussian (output) variables. If your +features are obviously non-Normal, consider transforming them to look more +Normal so as to improve performance. + +In addition to using an imputing method, we can also keep an indication of the +missing information using :func:`sklearn.impute.MissingIndicator` which might +carry some information.
""" import numpy as np @@ -21,9 +25,10 @@ from sklearn.datasets import load_diabetes from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.pipeline import make_pipeline, make_union +from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator from sklearn.model_selection import cross_val_score +from sklearn.metrics import mean_absolute_error rng = np.random.RandomState(0) @@ -60,18 +65,18 @@ def get_results(dataset): X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() - estimator = Pipeline([("imputer", SimpleImputer(missing_values=0, - strategy="mean")), - ("forest", RandomForestRegressor(random_state=0, - n_estimators=100))]) + estimator = make_pipeline( + make_union(SimpleImputer(missing_values=0, strategy="mean"), + MissingIndicator(missing_values=0)), + RandomForestRegressor(random_state=0, n_estimators=100)) mean_impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error') # Estimate the score after chained imputation of the missing values - estimator = Pipeline([("imputer", ChainedImputer(missing_values=0, - random_state=0)), - ("forest", RandomForestRegressor(random_state=0, - n_estimators=100))]) + estimator = make_pipeline( + make_union(ChainedImputer(missing_values=0, random_state=0), + MissingIndicator(missing_values=0)), + RandomForestRegressor(random_state=0, n_estimators=100)) chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error') From 52d1c022769fcef278277c1dc577eaeee3c2f8d4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Jul 2018 23:02:57 +0200 Subject: [PATCH 48/49] Update plot_missing_values.py --- examples/plot_missing_values.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/plot_missing_values.py 
b/examples/plot_missing_values.py index 10c46733188d5..8cd20087dfb0f 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -28,7 +28,6 @@ from sklearn.pipeline import make_pipeline, make_union from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator from sklearn.model_selection import cross_val_score -from sklearn.metrics import mean_absolute_error rng = np.random.RandomState(0) From 76558e5d36085e8380f5c8c067fc3181971ca805 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 15 Jul 2018 23:05:11 +0200 Subject: [PATCH 49/49] DOC fix --- sklearn/impute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 4b3eb7a5143bb..d3d1c314347b1 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1007,8 +1007,8 @@ class MissingIndicator(BaseEstimator, TransformerMixin): error_on_new : boolean, optional If True (default), transform will raise an error when there are - features with missing values in transform but have no missing values in - fit This is applicable only when ``features="missing-only"``. + features with missing values in transform that have no missing values + in fit. This is applicable only when ``features="missing-only"``. Attributes ----------