From fa148c983e47d27836a17cb07c968d479d464fd8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 16:26:47 +1100 Subject: [PATCH 1/9] Debugging a doctest heisenbug: Add unit test equivalent --- sklearn/tests/test_impute.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index fd2bbd4ec5ad0..f9d0cae82c64a 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -931,3 +931,14 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) + + +def test_heisenbug(): + imp = IterativeImputer(n_iter=10, random_state=0) + imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) + X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + assert_allclose( + np.round(imp.transform(X_test)), + [[1., 2.] + [6., 3.] + [26., 6.]]) From 09bb175d2dda49b813490453c92daa96655d4af3 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 17:04:05 +1100 Subject: [PATCH 2/9] Fix copy-paste error --- sklearn/tests/test_impute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f9d0cae82c64a..124a75368b752 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -939,6 +939,6 @@ def test_heisenbug(): X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] assert_allclose( np.round(imp.transform(X_test)), - [[1., 2.] - [6., 3.] + [[1., 2.], + [6., 3.], [26., 6.]]) From c30412073533fe4d184d17dc3c113c7efd7907a1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 18:28:53 +1100 Subject: [PATCH 3/9] Some debug output --- sklearn/impute.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/impute.py b/sklearn/impute.py index f888d7fe83d4f..d7ac2fa0fc347 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -968,6 +968,9 @@ def transform(self, X): predictor=predictor_triplet.predictor, fit_mode=False ) + print(it, 'int:', predictor_triplet.predictor.intercept_, + 'coef:', predictor_triplet.predictor.coef_) + print('out:', Xt) if not (it + 1) % imputations_per_round: if self.verbose > 1: print('[IterativeImputer] Ending imputation round ' From 2f5694c33f2b664ceb3a48cb004a1bb7c57d2adc Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 18:55:12 +1100 Subject: [PATCH 4/9] Show transformation during training --- sklearn/impute.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/impute.py b/sklearn/impute.py index d7ac2fa0fc347..f16821a0c272f 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -917,6 +917,7 @@ def fit_transform(self, X, y=None): Xt, predictor = self._impute_one_feature( Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) + print(Xt) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) From efea77888ecfdb024f73a92c787c587dcb35b3d3 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 19:02:55 +1100 Subject: [PATCH 5/9] Show alpha --- sklearn/impute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index f16821a0c272f..cd9632dc70156 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -970,7 +970,8 @@ def transform(self, X): fit_mode=False ) print(it, 'int:', predictor_triplet.predictor.intercept_, - 'coef:', predictor_triplet.predictor.coef_) + 'coef:', predictor_triplet.predictor.coef_, + 'alpha:', predictor_triplet.predictor.alpha_) 
print('out:', Xt) if not (it + 1) % imputations_per_round: if self.verbose > 1: From 83c481df29af98771454a784e018ec8639a2292e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 11:38:07 +1100 Subject: [PATCH 6/9] Make the example more obvious --- doc/modules/impute.rst | 7 +- sklearn/cluster/tests/test_bicluster.py | 19 ++ sklearn/impute.py | 410 +++++++++++------------- sklearn/tests/test_impute.py | 11 - 4 files changed, 212 insertions(+), 235 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1d1f6e926e8f8..76d170e506c2a 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,16 +106,17 @@ round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + >>> imp.fit([[1, 2], [3, 6], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, sample_posterior=False, verbose=False) + # the model learns that the second feature is double the first >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 3.] - [26. 6.]] + [ 6. 12.] + [ 3. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index dd5e91c18c27e..40ab9f8961667 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -232,6 +232,25 @@ def test_perfect_checkerboard(): (rows, cols)), 1) +def test_compare_svd_methods(): + data = np.array([[-2, -4, 2], + [-2, 1, 2], + [4, 2, 5]]) + + model_rand = SpectralCoclustering(n_clusters=2, + svd_method='randomized', + random_state=0) + model_rand.fit(data) + + model_arpack = SpectralCoclustering(n_clusters=2, + svd_method='arpack', + random_state=0) + model_arpack.fit(data) + + assert_array_equal(model_rand.column_labels_, + model_arpack.column_labels_) + + def test_errors(): data = np.arange(25).reshape((5, 5)) diff --git a/sklearn/impute.py b/sklearn/impute.py index cd9632dc70156..fec9d8b0d7a8d 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -6,13 +6,11 @@ from __future__ import division import warnings -import numbers from time import time -from distutils.version import LooseVersion +import numbers import numpy as np import numpy.ma as ma -import scipy from scipy import sparse from scipy import stats from collections import namedtuple @@ -27,6 +25,10 @@ from .utils.fixes import _object_dtype_isnan from .utils import is_scalar_nan +from .externals import six + +zip = six.moves.zip +map = six.moves.map ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', 'neighbor_feat_idx', @@ -35,7 +37,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'IterativeImputer', + 'ChainedImputer', ] @@ -140,6 +142,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): a new copy will always be made, even if `copy=False`: - If X is not an array of floating values; + - If X is sparse and `missing_values=0`; - If X is encoded as a CSR matrix. 
Attributes @@ -147,26 +150,6 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. - See also - -------- - IterativeImputer : Multivariate imputation of missing values. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import SimpleImputer - >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') - >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) - ... # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, fill_value=None, missing_values=nan, - strategy='mean', verbose=0) - >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] - >>> print(imp_mean.transform(X)) - ... # doctest: +NORMALIZE_WHITESPACE - [[ 7. 2. 3. ] - [ 4. 3.5 6. ] - [10. 3.5 9. ]] - Notes ----- Columns which only contained missing values at `fit` are discarded upon @@ -254,17 +237,10 @@ def fit(self, X, y=None): "data".format(fill_value)) if sparse.issparse(X): - # missing_values = 0 not allowed with sparse data as it would - # force densification - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, @@ -275,41 +251,80 @@ def fit(self, X, y=None): def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" - mask_data = _get_mask(X.data, missing_values) - n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + # Count the zeros + if missing_values == 0: + n_zeros_axis = np.zeros(X.shape[1], dtype=int) + else: + n_zeros_axis = X.shape[0] - np.diff(X.indptr) - statistics = np.empty(X.shape[1]) + # Mean + if strategy == "mean": + if missing_values != 0: + n_non_missing = n_zeros_axis + + # Mask the missing elements + mask_missing_values = _get_mask(X.data, missing_values) + mask_valids = np.logical_not(mask_missing_values) + + # Sum only the valid elements + new_data = X.data.copy() + new_data[mask_missing_values] = 0 + X = sparse.csc_matrix((new_data, X.indices, X.indptr), + copy=False) + sums = X.sum(axis=0) + + # Count the elements != 0 + mask_non_zeros = sparse.csc_matrix( + (mask_valids.astype(np.float64), + X.indices, + X.indptr), copy=False) + s = mask_non_zeros.sum(axis=0) + n_non_missing = np.add(n_non_missing, s) + + else: + sums = X.sum(axis=0) + n_non_missing = np.diff(X.indptr) - if strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - statistics.fill(fill_value) + # Ignore the error, columns with a np.nan statistics_ + # are not an error at this point. 
These columns will + # be removed in transform + with np.errstate(all="ignore"): + return np.ravel(sums) / np.ravel(n_non_missing) + # Median + Most frequent + Constant else: - for i in range(X.shape[1]): - column = X.data[X.indptr[i]:X.indptr[i + 1]] - mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] - column = column[~mask_column] - - # combine explicit and implicit zeros - mask_zeros = _get_mask(column, 0) - column = column[~mask_zeros] - n_explicit_zeros = mask_zeros.sum() - n_zeros = n_implicit_zeros[i] + n_explicit_zeros - - if strategy == "mean": - s = column.size + n_zeros - statistics[i] = np.nan if s == 0 else column.sum() / s - - elif strategy == "median": - statistics[i] = _get_median(column, - n_zeros) - - elif strategy == "most_frequent": - statistics[i] = _most_frequent(column, - 0, - n_zeros) - return statistics + # Remove the missing values, for each column + columns_all = np.hsplit(X.data, X.indptr[1:-1]) + mask_missing_values = _get_mask(X.data, missing_values) + mask_valids = np.hsplit(np.logical_not(mask_missing_values), + X.indptr[1:-1]) + + # astype necessary for bug in numpy.hsplit before v1.9 + columns = [col[mask.astype(bool, copy=False)] + for col, mask in zip(columns_all, mask_valids)] + + # Median + if strategy == "median": + median = np.empty(len(columns)) + for i, column in enumerate(columns): + median[i] = _get_median(column, n_zeros_axis[i]) + + return median + + # Most frequent + elif strategy == "most_frequent": + most_frequent = np.empty(len(columns)) + + for i, column in enumerate(columns): + most_frequent[i] = _most_frequent(column, + 0, + n_zeros_axis[i]) + + return most_frequent + + # Constant + elif strategy == "constant": + return np.full(X.shape[1], fill_value) def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" @@ -359,8 +374,6 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): @@ -368,7 +381,7 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. """ check_is_fitted(self, 'statistics_') @@ -399,19 +412,17 @@ def transform(self, X): X = X[:, valid_statistics_indexes] # Do actual imputation - if sparse.issparse(X): - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] + if sparse.issparse(X) and self.missing_values != 0: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), + np.diff(X.indptr))[mask] - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) else: + if sparse.issparse(X): + X = X.toarray() + mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) @@ -422,13 +433,14 @@ def transform(self, X): return X -class IterativeImputer(BaseEstimator, TransformerMixin): - """Multivariate imputer that estimates each feature from all the others. 
+class ChainedImputer(BaseEstimator, TransformerMixin): + """Chained imputer transformer to impute missing values. - A strategy for imputing missing values by modeling each feature with - missing values as a function of other features in a round-robin fashion. + Basic implementation of chained imputer from MICE (Multivariate + Imputations by Chained Equations) package from R. This version assumes all + of the features are Gaussian. - Read more in the :ref:`User Guide <iterative_imputer>`. + Read more in the :ref:`User Guide <chained_imputer>`. Parameters ---------- @@ -450,34 +462,24 @@ class IterativeImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_iter : int, optional (default=10) - Number of imputation rounds to perform before returning the imputations - computed during the final round. A round is a single imputation of each - feature with missing values. + n_imputations : int, optional (default=100) + Number of chained imputation rounds to perform, the results of which + will be used in the final average. + + n_burn_in : int, optional (default=10) + Number of initial imputation rounds to perform the results of which + will not be returned. - predictor : estimator object, default=RidgeCV() or BayesianRidge() + predictor : estimator object, default=BayesianRidge() The predictor to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support - ``return_std`` in its ``predict`` method. Also, if - ``sample_posterior=True`` the default predictor will be - :class:`sklearn.linear_model.BayesianRidge` and - :class:`sklearn.linear_model.RidgeCV` otherwise. - - sample_posterior : boolean, default=False - Whether to sample from the (Gaussian) predictive posterior of the - fitted predictor for each imputation. Predictor must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. + It must support ``return_std`` in its ``predict`` method. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - each feature column. Nearness between features is measured using + the each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). To ensure coverage of features throughout the - imputation process, the neighbor features are not necessarily nearest, - but are drawn with probability proportional to correlation for each - imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. + initial imputation). Can provide significant speed-up when the number + of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -498,43 +500,37 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use. Randomizes - selection of predictor features if n_nearest_features is not None, the - ``imputation_order`` if ``random``, and the sampling from posterior if - ``sample_posterior`` is True. Use an integer for determinism. - See :term:`the Glossary <random_state>`. + The seed of the pseudo random number generator to use when shuffling + the data. 
If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by ``np.random``. Attributes ---------- - initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. + initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' + The imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. Length is ``self.n_features_with_missing_ * n_iter``. - - n_features_with_missing_ : int - Number of features with missing values. - - See also - -------- - SimpleImputer : Univariate imputation of missing values. + the imputation. Notes ----- - To support imputation in inductive mode we store each feature's predictor - during the ``fit`` phase, and predict without refitting (in order) during - the ``transform`` phase. + The R version of MICE does not have inductive functionality, i.e. first + fitting on ``X_train`` and then transforming any ``X_test`` without + additional fitting. We do this by storing each feature's predictor during + the round-robin ``fit`` phase, and predicting without refitting (in order) + during the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values during ``transform`` which did not have any - missing values during ``fit`` will be imputed with the initial imputation - method only. + Features with missing values in transform which did not have any missing + values in fit will be imputed with the initial imputation method only. References ---------- .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. <https://www.jstatsoft.org/article/view/v045i03>`_ - - .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in - Multivariate Data Suitable for use with an Electronic Computer". - Journal of the Royal Statistical Society 22(2): 302-306. - <https://www.jstor.org/stable/2984099>`_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_iter=10, + n_imputations=100, + n_burn_in=10, predictor=None, - sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -564,9 +555,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_iter = n_iter + self.n_imputations = n_imputations + self.n_burn_in = n_burn_in self.predictor = predictor - self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -604,8 +595,7 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support - ``return_std`` in its ``predict`` method. + It must support ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. 
fit_mode : boolean, default=True @@ -644,34 +634,17 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - if self.sample_posterior: - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - mus = mus[good_sigmas] - sigmas = sigmas[good_sigmas] - a = (self._min_value - mus) / sigmas - b = (self._max_value - mus) / sigmas - - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[good_sigmas] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[good_sigmas] = truncated_normal.rvs( - random_state=self.random_state_) - else: - imputed_values = predictor.predict(X_test) - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + + # clip the values + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) # update the feature X_filled[missing_row_mask, feat_idx] = imputed_values @@ -862,51 +835,44 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) - if self.n_iter < 0: - raise ValueError( - "'n_iter' should be a positive integer. Got {} instead." - .format(self.n_iter)) - if self.predictor is None: - if self.sample_posterior: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() - else: - from .linear_model import RidgeCV - # including a very small alpha to approximate OLS - self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() else: self._predictor = clone(self.predictor) - if hasattr(self._predictor, 'random_state'): - self._predictor.random_state = self.random_state_ - - self._min_value = -np.inf if self.min_value is None else self.min_value - self._max_value = np.inf if self.max_value is None else self.max_value + self._min_value = np.nan if self.min_value is None else self.min_value + self._max_value = np.nan if self.max_value is None else self.max_value self.initial_imputer_ = None - X, Xt, mask_missing_values = self._initial_imputation(X) + X, X_filled, mask_missing_values = self._initial_imputation(X) + + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled - if self.n_iter == 0: - return Xt + X_filled = np.clip(X_filled, self._min_value, self._max_value) # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. 
# see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) - self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(Xt) + abs_corr_mat = self._get_abs_corr_mat(X_filled) # impute data - n_samples, n_features = Xt.shape + n_rounds = self.n_burn_in + self.n_imputations + n_samples, n_features = X_filled.shape + Xt = np.zeros((n_samples, n_features), dtype=X.dtype) self.imputation_sequence_ = [] if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" + print("[ChainedImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(self.n_iter): + for i_rnd in range(n_rounds): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -914,20 +880,22 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - Xt, predictor = self._impute_one_feature( - Xt, mask_missing_values, feat_idx, neighbor_feat_idx, + X_filled, predictor = self._impute_one_feature( + X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) - print(Xt) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) + if i_rnd >= self.n_burn_in: + Xt += X_filled if self.verbose > 0: - print('[IterativeImputer] Ending imputation round ' + print('[ChainedImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) + % (i_rnd + 1, n_rounds, time() - start_t)) + Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -949,37 +917,44 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, Xt, mask_missing_values = self._initial_imputation(X) + X, X_filled, mask_missing_values = self._initial_imputation(X) - if self.n_iter == 0: - return Xt + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled - imputations_per_round = len(self.imputation_sequence_) // self.n_iter + X_filled = np.clip(X_filled, self._min_value, self._max_value) + + n_rounds = self.n_burn_in + self.n_imputations + n_imputations = len(self.imputation_sequence_) + imputations_per_round = n_imputations // n_rounds i_rnd = 0 + Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" + print("[ChainedImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - Xt, _ = self._impute_one_feature( - Xt, + X_filled, _ = self._impute_one_feature( + X_filled, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, predictor=predictor_triplet.predictor, fit_mode=False ) - print(it, 'int:', predictor_triplet.predictor.intercept_, - 'coef:', predictor_triplet.predictor.coef_, - 'alpha:', predictor_triplet.predictor.alpha_) - print('out:', Xt) if not (it + 1) % imputations_per_round: + if i_rnd >= self.n_burn_in: + Xt += X_filled if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' + print('[ChainedImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) + % (i_rnd + 1, n_rounds, time() - start_t)) i_rnd += 1 + Xt /= 
self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -1006,18 +981,11 @@ def fit(self, X, y=None): class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. - Note that this component typically should not not be used in a vanilla - :class:`Pipeline` consisting of transformers and a classifier, but rather - could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. - - Read more in the :ref:`User Guide <impute>`. - Parameters ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. + `missing_values` will be imputed. features : str, optional Whether the imputer mask should represent all or a subset of @@ -1038,7 +1006,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): error_on_new : boolean, optional If True (default), transform will raise an error when there are features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. + in fit This is applicable only when ``features="missing-only"``. Attributes ---------- @@ -1058,7 +1026,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [np.nan, 2, 3], ... [2, 4, 0]]) >>> indicator = MissingIndicator() - >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE + >>> indicator.fit(X1) MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan, sparse='auto') >>> X2_tr = indicator.transform(X2) @@ -1159,7 +1127,7 @@ def fit(self, X, y=None): raise ValueError("'features' has to be either 'missing-only' or " "'all'. Got {} instead.".format(self.features)) - if not ((isinstance(self.sparse, str) and + if not ((isinstance(self.sparse, six.string_types) and self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("'sparse' has to be a boolean or 'auto'. 
" "Got {!r} instead.".format(self.sparse)) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 124a75368b752..fd2bbd4ec5ad0 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -931,14 +931,3 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) - - -def test_heisenbug(): - imp = IterativeImputer(n_iter=10, random_state=0) - imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) - X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] - assert_allclose( - np.round(imp.transform(X_test)), - [[1., 2.], - [6., 3.], - [26., 6.]]) From d8b7008d5ca73796634c8b58e602ed9ff3faddb6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 11:54:26 +1100 Subject: [PATCH 7/9] Resurrect after a git typo --- sklearn/impute.py | 405 ++++++++++++++++++++++++---------------------- 1 file changed, 216 insertions(+), 189 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index fec9d8b0d7a8d..f888d7fe83d4f 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -6,11 +6,13 @@ from __future__ import division import warnings -from time import time import numbers +from time import time +from distutils.version import LooseVersion import numpy as np import numpy.ma as ma +import scipy from scipy import sparse from scipy import stats from collections import namedtuple @@ -25,10 +27,6 @@ from .utils.fixes import _object_dtype_isnan from .utils import is_scalar_nan -from .externals import six - -zip = six.moves.zip -map = six.moves.map ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', 'neighbor_feat_idx', @@ -37,7 +35,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'ChainedImputer', + 'IterativeImputer', ] @@ -142,7 +140,6 @@ class SimpleImputer(BaseEstimator, TransformerMixin): a new copy will always be made, even if `copy=False`: - If X is not an array of floating values; - - If X is sparse and `missing_values=0`; - If X is encoded as a CSR matrix. Attributes @@ -150,6 +147,26 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + ... # doctest: +NORMALIZE_WHITESPACE + SimpleImputer(copy=True, fill_value=None, missing_values=nan, + strategy='mean', verbose=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + ... # doctest: +NORMALIZE_WHITESPACE + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + Notes ----- Columns which only contained missing values at `fit` are discarded upon @@ -237,10 +254,17 @@ def fit(self, X, y=None): "data".format(fill_value)) if sparse.issparse(X): - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. 
Provide a dense " + "array instead.") + else: + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, @@ -251,80 +275,41 @@ def fit(self, X, y=None): def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" - # Count the zeros - if missing_values == 0: - n_zeros_axis = np.zeros(X.shape[1], dtype=int) - else: - n_zeros_axis = X.shape[0] - np.diff(X.indptr) + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) - # Mean - if strategy == "mean": - if missing_values != 0: - n_non_missing = n_zeros_axis - - # Mask the missing elements - mask_missing_values = _get_mask(X.data, missing_values) - mask_valids = np.logical_not(mask_missing_values) - - # Sum only the valid elements - new_data = X.data.copy() - new_data[mask_missing_values] = 0 - X = sparse.csc_matrix((new_data, X.indices, X.indptr), - copy=False) - sums = X.sum(axis=0) - - # Count the elements != 0 - mask_non_zeros = sparse.csc_matrix( - (mask_valids.astype(np.float64), - X.indices, - X.indptr), copy=False) - s = mask_non_zeros.sum(axis=0) - n_non_missing = np.add(n_non_missing, s) - - else: - sums = X.sum(axis=0) - n_non_missing = np.diff(X.indptr) + statistics = np.empty(X.shape[1]) - # Ignore the error, columns with a np.nan statistics_ - # are not an error at this point. These columns will - # be removed in transform - with np.errstate(all="ignore"): - return np.ravel(sums) / np.ravel(n_non_missing) + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) - # Median + Most frequent + Constant else: - # Remove the missing values, for each column - columns_all = np.hsplit(X.data, X.indptr[1:-1]) - mask_missing_values = _get_mask(X.data, missing_values) - mask_valids = np.hsplit(np.logical_not(mask_missing_values), - X.indptr[1:-1]) - - # astype necessary for bug in numpy.hsplit before v1.9 - columns = [col[mask.astype(bool, copy=False)] - for col, mask in zip(columns_all, mask_valids)] - - # Median - if strategy == "median": - median = np.empty(len(columns)) - for i, column in enumerate(columns): - median[i] = _get_median(column, n_zeros_axis[i]) - - return median - - # Most frequent - elif strategy == "most_frequent": - most_frequent = np.empty(len(columns)) - - for i, column in enumerate(columns): - most_frequent[i] = _most_frequent(column, - 0, - n_zeros_axis[i]) - - return most_frequent - - # Constant - elif strategy == "constant": - return np.full(X.shape[1], fill_value) + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, + n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, + 0, + n_zeros) + return statistics def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" @@ -374,6 +359,8 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == 
"constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): @@ -381,7 +368,7 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. """ check_is_fitted(self, 'statistics_') @@ -412,17 +399,19 @@ def transform(self, X): X = X[:, valid_statistics_indexes] # Do actual imputation - if sparse.issparse(X) and self.missing_values != 0: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] + if sparse.issparse(X): + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), + np.diff(X.indptr))[mask] - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) else: - if sparse.issparse(X): - X = X.toarray() - mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) @@ -433,14 +422,13 @@ def transform(self, X): return X -class ChainedImputer(BaseEstimator, TransformerMixin): - """Chained imputer transformer to impute missing values. +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. - Basic implementation of chained imputer from MICE (Multivariate - Imputations by Chained Equations) package from R. This version assumes all - of the features are Gaussian. + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -462,24 +450,34 @@ class ChainedImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_imputations : int, optional (default=100) - Number of chained imputation rounds to perform, the results of which - will be used in the final average. - - n_burn_in : int, optional (default=10) - Number of initial imputation rounds to perform the results of which - will not be returned. + n_iter : int, optional (default=10) + Number of imputation rounds to perform before returning the imputations + computed during the final round. A round is a single imputation of each + feature with missing values. - predictor : estimator object, default=BayesianRidge() + predictor : estimator object, default=RidgeCV() or BayesianRidge() The predictor to use at each step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. Also, if + ``sample_posterior=True`` the default predictor will be + :class:`sklearn.linear_model.BayesianRidge` and + :class:`sklearn.linear_model.RidgeCV` otherwise. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted predictor for each imputation. Predictor must support + ``return_std`` in its ``predict`` method if set to ``True``. 
Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - the each feature column. Nearness between features is measured using + each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). Can provide significant speed-up when the number - of features is huge. If ``None``, all features will be used. + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -500,37 +498,43 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by ``np.random``. + The seed of the pseudo random number generator to use. Randomizes + selection of predictor features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary <random_state>`. Attributes ---------- - initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' - The imputer used to initialize the missing values. + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. + the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. Notes ----- - The R version of MICE does not have inductive functionality, i.e. first - fitting on ``X_train`` and then transforming any ``X_test`` without - additional fitting. We do this by storing each feature's predictor during - the round-robin ``fit`` phase, and predicting without refitting (in order) - during the ``transform`` phase. + To support imputation in inductive mode we store each feature's predictor + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values in transform which did not have any missing - values in fit will be imputed with the initial imputation method only. + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. 
References ---------- .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. <https://www.jstatsoft.org/article/view/v045i03>`_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + <https://www.jstor.org/stable/2984099>`_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_imputations=100, - n_burn_in=10, + n_iter=10, predictor=None, + sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -555,9 +564,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_imputations = n_imputations - self.n_burn_in = n_burn_in + self.n_iter = n_iter self.predictor = predictor + self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -595,7 +604,8 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. fit_mode : boolean, default=True @@ -634,17 +644,34 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - imputed_values[good_sigmas] = self.random_state_.normal( - loc=mus[good_sigmas], scale=sigmas[good_sigmas]) - - # clip the values - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) + if self.sample_posterior: + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + mus = mus[good_sigmas] + sigmas = sigmas[good_sigmas] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[good_sigmas] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[good_sigmas] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = predictor.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) # update the feature X_filled[missing_row_mask, feat_idx] = imputed_values @@ -835,44 +862,51 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) + if self.n_iter < 0: + raise ValueError( + "'n_iter' should be a positive integer. Got {} instead." 
+ .format(self.n_iter)) + if self.predictor is None: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + if self.sample_posterior: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + from .linear_model import RidgeCV + # including a very small alpha to approximate OLS + self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) else: self._predictor = clone(self.predictor) - self._min_value = np.nan if self.min_value is None else self.min_value - self._max_value = np.nan if self.max_value is None else self.max_value + if hasattr(self._predictor, 'random_state'): + self._predictor.random_state = self.random_state_ - self.initial_imputer_ = None - X, X_filled, mask_missing_values = self._initial_imputation(X) + self._min_value = -np.inf if self.min_value is None else self.min_value + self._max_value = np.inf if self.max_value is None else self.max_value - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + self.initial_imputer_ = None + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. # see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(X_filled) + abs_corr_mat = self._get_abs_corr_mat(Xt) # impute data - n_rounds = self.n_burn_in + self.n_imputations - n_samples, n_features = X_filled.shape - Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + n_samples, n_features = Xt.shape self.imputation_sequence_ = [] if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(n_rounds): + for i_rnd in range(self.n_iter): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -880,22 +914,19 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - X_filled, predictor = self._impute_one_feature( - X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + Xt, predictor = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 0: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -917,28 +948,20 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, X_filled, mask_missing_values = self._initial_imputation(X) + X, Xt, mask_missing_values = self._initial_imputation(X) - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial 
imputation (before clipping) - if self.n_imputations < 1: - return X_filled + if self.n_iter == 0: + return Xt - X_filled = np.clip(X_filled, self._min_value, self._max_value) - - n_rounds = self.n_burn_in + self.n_imputations - n_imputations = len(self.imputation_sequence_) - imputations_per_round = n_imputations // n_rounds + imputations_per_round = len(self.imputation_sequence_) // self.n_iter i_rnd = 0 - Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - X_filled, _ = self._impute_one_feature( - X_filled, + Xt, _ = self._impute_one_feature( + Xt, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, predictor=predictor_triplet.predictor, fit_mode=False ) if not (it + 1) % imputations_per_round: - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 1: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) i_rnd += 1 - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -981,11 +1001,18 @@ def fit(self, X, y=None): class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. + Note that this component typically should not not be used in a vanilla + :class:`Pipeline` consisting of transformers and a classifier, but rather + could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. + + Read more in the :ref:`User Guide <impute>`. + Parameters ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be indicated (True in the output array), the + other values will be marked as False. features : str, optional Whether the imputer mask should represent all or a subset of @@ -1006,7 +1038,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): error_on_new : boolean, optional If True (default), transform will raise an error when there are features with missing values in transform that have no missing values - in fit This is applicable only when ``features="missing-only"``. + in fit. This is applicable only when ``features="missing-only"``. Attributes ---------- @@ -1026,7 +1058,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [np.nan, 2, 3], ... [2, 4, 0]]) >>> indicator = MissingIndicator() - >>> indicator.fit(X1) + >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan, sparse='auto') >>> X2_tr = indicator.transform(X2) @@ -1127,7 +1159,7 @@ def fit(self, X, y=None): raise ValueError("'features' has to be either 'missing-only' or " "'all'. Got {} instead.".format(self.features)) - if not ((isinstance(self.sparse, six.string_types) and + if not ((isinstance(self.sparse, str) and self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("'sparse' has to be a boolean or 'auto'. 
" "Got {!r} instead.".format(self.sparse)) From 6268e27ffe119aee34fe4e8048c13ce13680942f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 13:15:00 +1100 Subject: [PATCH 8/9] More git management fails --- sklearn/cluster/tests/test_bicluster.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 40ab9f8961667..dd5e91c18c27e 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -232,25 +232,6 @@ def test_perfect_checkerboard(): (rows, cols)), 1) -def test_compare_svd_methods(): - data = np.array([[-2, -4, 2], - [-2, 1, 2], - [4, 2, 5]]) - - model_rand = SpectralCoclustering(n_clusters=2, - svd_method='randomized', - random_state=0) - model_rand.fit(data) - - model_arpack = SpectralCoclustering(n_clusters=2, - svd_method='arpack', - random_state=0) - model_arpack.fit(data) - - assert_array_equal(model_rand.column_labels_, - model_arpack.column_labels_) - - def test_errors(): data = np.arange(25).reshape((5, 5)) From 5285e833155a41d3cdb0a42ad40b86e2dfa17a76 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 21:43:04 +1100 Subject: [PATCH 9/9] Fixes to doctest --- doc/modules/impute.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 76d170e506c2a..45523d74fe9b8 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,16 +106,16 @@ round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [3, 6], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, sample_posterior=False, verbose=False) - # the model learns that the second feature is double the first >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 12.] + [ 6. 12.] [ 3. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline