diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 6de5df8b12729..fa8ee3bd4574b 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -45,10 +45,11 @@ that contain the missing values::
     >>> import numpy as np
     >>> from sklearn.impute import SimpleImputer
     >>> imp = SimpleImputer(missing_values=np.nan, strategy='mean')
-    >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])       # doctest: +NORMALIZE_WHITESPACE
-    SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0)
+    >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])  # doctest: +NORMALIZE_WHITESPACE
+    SimpleImputer(add_indicator=False, copy=True, fill_value=None,
+                  missing_values=nan, strategy='mean', verbose=0)
     >>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
-    >>> print(imp.transform(X))           # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+    >>> print(imp.transform(X))  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
     [[4.          2.        ]
      [6.          3.666...]
      [7.          6.        ]]
@@ -59,9 +60,10 @@ The :class:`SimpleImputer` class also supports sparse matrices::
     >>> X = sp.csc_matrix([[1, 2], [0, -1], [8, 4]])
     >>> imp = SimpleImputer(missing_values=-1, strategy='mean')
     >>> imp.fit(X)                  # doctest: +NORMALIZE_WHITESPACE
-    SimpleImputer(copy=True, fill_value=None, missing_values=-1, strategy='mean', verbose=0)
+    SimpleImputer(add_indicator=False, copy=True, fill_value=None,
+                  missing_values=-1, strategy='mean', verbose=0)
     >>> X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])
-    >>> print(imp.transform(X_test).toarray())      # doctest: +NORMALIZE_WHITESPACE
+    >>> print(imp.transform(X_test).toarray())  # doctest: +NORMALIZE_WHITESPACE
     [[3. 2.]
      [6. 3.]
      [7. 6.]]
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 9d815abd06fc0..b2df99d9a131b 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -255,6 +255,12 @@ Support for Python 3.4 and below has been officially dropped.
   used to be kept if there were no missing values at all. :issue:`13562` by
   :user:`Jérémie du Boisberranger `.
 
+- |Feature| The :class:`impute.SimpleImputer` has a new parameter
+  ``add_indicator``, which stacks the output of a
+  :class:`impute.MissingIndicator` transform onto the output of the
+  imputer's transform. This allows a predictive estimator to account
+  for missingness. :issue:`12583` by :user:`Danylo Baibak `.
+
 :mod:`sklearn.isotonic`
 .......................
 
diff --git a/sklearn/impute.py b/sklearn/impute.py
index 39550e242889f..7994e6ab57a88 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -141,13 +141,26 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
         a new copy will always be made, even if `copy=False`:
 
         - If X is not an array of floating values;
-        - If X is encoded as a CSR matrix.
+        - If X is encoded as a CSR matrix;
+        - If add_indicator=True.
+
+    add_indicator : boolean, optional (default=False)
+        If True, a `MissingIndicator` transform will be stacked onto the
+        output of the imputer's transform. This allows a predictive
+        estimator to account for missingness despite imputation. If a
+        feature has no missing values at fit/train time, the feature won't
+        appear in the missing indicator even if there are missing values
+        at transform/test time.
 
     Attributes
     ----------
     statistics_ : array of shape (n_features,)
         The imputation fill value for each feature.
 
+    indicator_ : :class:`sklearn.impute.MissingIndicator`
+        Indicator used to add binary indicators for missing values.
+        ``None`` if ``add_indicator`` is False.
+
     See also
     --------
     IterativeImputer : Multivariate imputation of missing values.
@@ -159,8 +172,8 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
     >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
     >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
     ...                           # doctest: +NORMALIZE_WHITESPACE
-    SimpleImputer(copy=True, fill_value=None, missing_values=nan,
-           strategy='mean', verbose=0)
+    SimpleImputer(add_indicator=False, copy=True, fill_value=None,
+                  missing_values=nan, strategy='mean', verbose=0)
     >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
     >>> print(imp_mean.transform(X))
     ...                           # doctest: +NORMALIZE_WHITESPACE
@@ -175,12 +188,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
     """
 
     def __init__(self, missing_values=np.nan, strategy="mean",
-                 fill_value=None, verbose=0, copy=True):
+                 fill_value=None, verbose=0, copy=True, add_indicator=False):
         self.missing_values = missing_values
         self.strategy = strategy
         self.fill_value = fill_value
         self.verbose = verbose
         self.copy = copy
+        self.add_indicator = add_indicator
 
     def _validate_input(self, X):
         allowed_strategies = ["mean", "median", "most_frequent", "constant"]
@@ -272,6 +286,13 @@ def fit(self, X, y=None):
                                                self.missing_values,
                                                fill_value)
 
+        if self.add_indicator:
+            self.indicator_ = MissingIndicator(
+                missing_values=self.missing_values)
+            self.indicator_.fit(X)
+        else:
+            self.indicator_ = None
+
         return self
 
     def _sparse_fit(self, X, strategy, missing_values, fill_value):
@@ -285,7 +306,6 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value):
             # for constant strategy, self.statistcs_ is used to store
             # fill_value in each column
             statistics.fill(fill_value)
-
         else:
             for i in range(X.shape[1]):
                 column = X.data[X.indptr[i]:X.indptr[i + 1]]
@@ -382,6 +402,9 @@ def transform(self, X):
                 raise ValueError("X has %d features per sample, expected %d"
                                  % (X.shape[1], self.statistics_.shape[0]))
 
+        if self.add_indicator:
+            X_trans_indicator = self.indicator_.transform(X)
+
         # Delete the invalid columns if strategy is not constant
         if self.strategy == "constant":
             valid_statistics = statistics
@@ -420,6 +443,10 @@ def transform(self, X):
 
             X[coordinates] = values
 
+        if self.add_indicator:
+            hstack = sparse.hstack if sparse.issparse(X) else np.hstack
+            X = hstack((X, X_trans_indicator))
+
         return X
 
     def _more_tags(self):
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 5fe81b8044934..8abccf71e978e 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -952,15 +952,15 @@ def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
 ])
 @pytest.mark.parametrize(
     "param_features, n_features, features_indices",
-    [('missing-only', 2, np.array([0, 1])),
+    [('missing-only', 3, np.array([0, 1, 2])),
      ('all', 3, np.array([0, 1, 2]))])
 def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                                n_features, features_indices):
     X_fit = np.array([[missing_values, missing_values, 1],
-                      [4, missing_values, 2]])
+                      [4, 2, missing_values]])
     X_trans = np.array([[missing_values, missing_values, 1],
                         [4, 12, 10]])
-    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]])
+    X_fit_expected = np.array([[1, 1, 0], [0, 0, 1]])
     X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]])
 
     # convert the input to the right array format and right dtype
@@ -1144,3 +1144,54 @@ def test_missing_indicator_sparse_no_explicit_zeros():
     Xt = mi.fit_transform(X)
 
     assert Xt.getnnz() == Xt.sum()
+
+
+@pytest.mark.parametrize("marker", [np.nan, -1, 0])
+def test_imputation_add_indicator(marker):
+    X = np.array([
+        [marker, 1, 5, marker, 1],
+        [2, marker, 1, marker, 2],
+        [6, 3, marker, marker, 3],
+        [1, 2, 9, marker, 4]
+    ])
+    X_true = np.array([
+        [3., 1., 5., 1., 1., 0., 0., 1.],
+        [2., 2., 1., 2., 0., 1., 0., 1.],
+        [6., 3., 5., 3., 0., 0., 1., 1.],
+        [1., 2., 9., 4., 0., 0., 0., 1.]
+    ])
+
+    imputer = SimpleImputer(missing_values=marker, add_indicator=True)
+    X_trans = imputer.fit_transform(X)
+
+    assert_allclose(X_trans, X_true)
+    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
+
+
+@pytest.mark.parametrize(
+    "arr_type",
+    [
+        sparse.csc_matrix, sparse.csr_matrix, sparse.coo_matrix,
+        sparse.lil_matrix, sparse.bsr_matrix
+    ]
+)
+def test_imputation_add_indicator_sparse_matrix(arr_type):
+    X_sparse = arr_type([
+        [np.nan, 1, 5],
+        [2, np.nan, 1],
+        [6, 3, np.nan],
+        [1, 2, 9]
+    ])
+    X_true = np.array([
+        [3., 1., 5., 1., 0., 0.],
+        [2., 2., 1., 0., 1., 0.],
+        [6., 3., 5., 0., 0., 1.],
+        [1., 2., 9., 0., 0., 0.],
+    ])
+
+    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
+    X_trans = imputer.fit_transform(X_sparse)
+
+    assert sparse.issparse(X_trans)
+    assert X_trans.shape == X_true.shape
+    assert_allclose(X_trans.toarray(), X_true)
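
For reference, a minimal usage sketch of the ``add_indicator`` behaviour this diff introduces. It is not part of the patch itself; it assumes an installation of scikit-learn that already includes this change, and the expected output in the comments follows from the mean strategy and the missing-indicator stacking shown above:

    # Sketch only: requires SimpleImputer with the add_indicator parameter
    # added by this diff.
    import numpy as np
    from sklearn.impute import SimpleImputer

    X = np.array([[np.nan, 2.0],
                  [6.0, np.nan],
                  [7.0, 6.0]])

    # With add_indicator=True, the transform output is the imputed matrix
    # with the MissingIndicator columns appended on the right: one binary
    # column per feature that contained missing values at fit time.
    imp = SimpleImputer(strategy='mean', add_indicator=True)
    X_trans = imp.fit_transform(X)

    print(X_trans)
    # Expected output (formatting approximate):
    # [[6.5 2.  1.  0. ]
    #  [6.  4.  0.  1. ]
    #  [7.  6.  0.  0. ]]

    print(imp.indicator_.features_)
    # [0 1]  -> both features had missing values during fit

Because the indicator is fitted on the training data, a feature with no missing values at fit/train time contributes no indicator column, even if missing values appear at transform/test time, as documented in the new ``add_indicator`` docstring above.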