diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2d3174a9dcc05..ab0f473be4083 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -656,7 +656,7 @@ Kernels: :template: class.rst impute.SimpleImputer - impute.ChainedImputer + impute.IterativeImputer impute.MissingIndicator .. _kernel_approximation_ref: diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 268ce1c3ede19..8bb3ad8bf940b 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -24,7 +24,7 @@ One type of imputation algorithm is univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the -missing values (e.g. :class:`impute.ChainedImputer`). +missing values (e.g. :class:`impute.IterativeImputer`). .. _single_imputer: @@ -87,37 +87,37 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] -.. _chained_imputer: +.. _iterative_imputer: Multivariate feature imputation =============================== -A more sophisticated approach is to use the :class:`ChainedImputer` class, which -implements the imputation technique from MICE (Multivariate Imputation by -Chained Equations). MICE models each feature with missing values as a function of -other features, and uses that estimate for imputation. It does so in a round-robin -fashion: at each step, a feature column is designated as output `y` and the other -feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`. -Then, the regressor is used to predict the unknown values of `y`. This is repeated -for each feature in a chained fashion, and then is done for a number of imputation -rounds. Here is an example snippet:: +A more sophisticated approach is to use the :class:`IterativeImputer` class, +which models each feature with missing values as a function of other features, +and uses that estimate for imputation. It does so in an iterated round-robin +fashion: at each step, a feature column is designated as output ``y`` and the +other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, +y)`` for known ``y``. Then, the regressor is used to predict the missing values +of ``y``. This is done for each feature in an iterative fashion, and then is +repeated for ``n_iter`` imputation rounds. The results of the final imputation +round are returned. >>> import numpy as np - >>> from sklearn.impute import ChainedImputer - >>> imp = ChainedImputer(n_imputations=10, random_state=0) + >>> from sklearn.impute import IterativeImputer + >>> imp = IterativeImputer(n_iter=10, random_state=0) >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) - ChainedImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_burn_in=10, - n_imputations=10, n_nearest_features=None, predictor=None, - random_state=0, verbose=False) + IterativeImputer(imputation_order='ascending', initial_strategy='mean', + max_value=None, min_value=None, missing_values=nan, n_iter=10, + n_nearest_features=None, predictor=None, random_state=0, + sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 4.] - [13. 6.]] + [ 6. 3.] + [24. 
6.]] -Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. @@ -127,21 +127,40 @@ Multiple vs. Single Imputation ============================== In the statistics community, it is common practice to perform multiple imputations, -generating, for example, 10 separate imputations for a single feature matrix. -Each of these 10 imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The 10 final -analysis results (e.g. held-out validation error) allow the data scientist to -obtain understanding of the uncertainty inherent in the missing values. The above -practice is called multiple imputation. As implemented, the :class:`ChainedImputer` -class generates a single (averaged) imputation for each missing value because this -is the most common use case for machine learning applications. However, it can also be used -for multiple imputations by applying it repeatedly to the same dataset with different -random seeds with the ``n_imputations`` parameter set to 1. - -Note that a call to the ``transform`` method of :class:`ChainedImputer` is not +generating, for example, ``m`` separate imputations for a single feature matrix. +Each of these ``m`` imputations is then put through the subsequent analysis pipeline +(e.g. feature engineering, clustering, regression, classification). The ``m`` final +analysis results (e.g. held-out validation errors) allow the data scientist +to obtain understanding of how analytic results may differ as a consequence +of the inherent uncertainty caused by the missing values. The above practice +is called multiple imputation. + +Our implementation of :class:`IterativeImputer` was inspired by the R MICE +package (Multivariate Imputation by Chained Equations) [1]_, but differs from +it by returning a single imputation instead of multiple imputations. However, +:class:`IterativeImputer` can also be used for multiple imputations by applying +it repeatedly to the same dataset with different random seeds when +``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple +vs. single imputations. + +It is still an open problem as to how useful single vs. multiple imputation is in +the context of prediction and classification when the user is not interested in +measuring uncertainty due to missing values. + +Note that a call to the ``transform`` method of :class:`IterativeImputer` is not allowed to change the number of samples. Therefore multiple imputations cannot be achieved by a single call to ``transform``. +References +========== + +.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate + Imputation by Chained Equations in R". Journal of Statistical Software 45: + 1-67. + +.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis + with Missing Data". John Wiley & Sons, Inc., New York, NY, USA. + .. _missing_indicator: Marking imputed values diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2c010e5b1be59..2159e39dc126d 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -43,7 +43,7 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.impute` ..................... 
-- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 3ab1cfff95576..43d7ddfc497f3 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -4,11 +4,11 @@ ==================================================== Missing values can be replaced by the mean, the median or the most frequent -value using the basic :func:`sklearn.impute.SimpleImputer`. +value using the basic :class:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). -Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +Another option is the :class:`sklearn.impute.IterativeImputer`. This uses round-robin linear regression, treating every variable as an output in turn. The version implemented assumes Gaussian (output) variables. If your features are obviously non-Normal, consider transforming them to look more @@ -26,7 +26,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator +from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -73,18 +73,18 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) - # Estimate the score after chained imputation of the missing values + # Estimate the score after iterative imputation of the missing values estimator = make_pipeline( - make_union(ChainedImputer(missing_values=0, random_state=0), + make_union(IterativeImputer(missing_values=0, random_state=0), MissingIndicator(missing_values=0)), RandomForestRegressor(random_state=0, n_estimators=100)) - chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') + iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), - (chained_impute_scores.mean(), chained_impute_scores.std())) + (iterative_impute_scores.mean(), iterative_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -101,7 +101,7 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', 'Mean Imputation', - 'Chained Imputation'] + 'Multivariate Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index 89fb33a4f9034..3035040c1179a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -37,7 +37,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'ChainedImputer', + 'IterativeImputer', ] @@ -149,6 +149,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. 
+ Examples -------- >>> import numpy as np @@ -420,14 +424,13 @@ def transform(self, X): return X -class ChainedImputer(BaseEstimator, TransformerMixin): - """Chained imputer transformer to impute missing values. +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. - Basic implementation of chained imputer from MICE (Multivariate - Imputations by Chained Equations) package from R. This version assumes all - of the features are Gaussian. + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -449,24 +452,34 @@ class ChainedImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_imputations : int, optional (default=100) - Number of chained imputation rounds to perform, the results of which - will be used in the final average. + n_iter : int, optional (default=10) + Number of imputation rounds to perform before returning the imputations + computed during the final round. A round is a single imputation of each + feature with missing values. - n_burn_in : int, optional (default=10) - Number of initial imputation rounds to perform the results of which - will not be returned. - - predictor : estimator object, default=BayesianRidge() + predictor : estimator object, default=RidgeCV() or BayesianRidge() The predictor to use at each step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. Also, if + ``sample_posterior=True`` the default predictor will be + :class:`sklearn.linear_model.BayesianRidge` and + :class:`sklearn.linear_model.RidgeCV` otherwise. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted predictor for each imputation. Predictor must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - the each feature column. Nearness between features is measured using + each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). Can provide significant speed-up when the number - of features is huge. If ``None``, all features will be used. + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -487,37 +500,43 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. 
If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by ``np.random``. + The seed of the pseudo random number generator to use. Randomizes + selection of predictor features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. Attributes ---------- - initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' - The imputer used to initialize the missing values. + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. + the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. Notes ----- - The R version of MICE does not have inductive functionality, i.e. first - fitting on ``X_train`` and then transforming any ``X_test`` without - additional fitting. We do this by storing each feature's predictor during - the round-robin ``fit`` phase, and predicting without refitting (in order) - during the ``transform`` phase. + To support imputation in inductive mode we store each feature's predictor + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values in transform which did not have any missing - values in fit will be imputed with the initial imputation method only. + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. References ---------- @@ -525,14 +544,19 @@ class ChainedImputer(BaseEstimator, TransformerMixin): Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_imputations=100, - n_burn_in=10, + n_iter=10, predictor=None, + sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -542,9 +566,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_imputations = n_imputations - self.n_burn_in = n_burn_in + self.n_iter = n_iter self.predictor = predictor + self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -582,7 +606,8 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. 
+ If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. fit_mode : boolean, default=True @@ -621,12 +646,15 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - imputed_values[good_sigmas] = self.random_state_.normal( - loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + if self.sample_posterior: + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + else: + imputed_values = predictor.predict(X_test) # clip the values imputed_values = np.clip(imputed_values, @@ -822,44 +850,51 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) + if self.n_iter < 0: + raise ValueError( + "'n_iter' should be a positive integer. Got {} instead." + .format(self.n_iter)) + if self.predictor is None: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + if self.sample_posterior: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + from .linear_model import RidgeCV + # including a very small alpha to approximate OLS + self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) else: self._predictor = clone(self.predictor) + if hasattr(self._predictor, 'random_state'): + self._predictor.random_state = self.random_state_ + self._min_value = np.nan if self.min_value is None else self.min_value self._max_value = np.nan if self.max_value is None else self.max_value self.initial_imputer_ = None - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. 
# see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(X_filled) + abs_corr_mat = self._get_abs_corr_mat(Xt) # impute data - n_rounds = self.n_burn_in + self.n_imputations - n_samples, n_features = X_filled.shape - Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + n_samples, n_features = Xt.shape self.imputation_sequence_ = [] if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(n_rounds): + for i_rnd in range(self.n_iter): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -867,22 +902,19 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - X_filled, predictor = self._impute_one_feature( - X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + Xt, predictor = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 0: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -904,28 +936,20 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt - n_rounds = self.n_burn_in + self.n_imputations - n_imputations = len(self.imputation_sequence_) - imputations_per_round = n_imputations // n_rounds + imputations_per_round = len(self.imputation_sequence_) // self.n_iter i_rnd = 0 - Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - X_filled, _ = self._impute_one_feature( - X_filled, + Xt, _ = self._impute_one_feature( + Xt, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, @@ -933,15 +957,12 @@ def transform(self, X): fit_mode=False ) if not (it + 1) % imputations_per_round: - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 1: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) i_rnd += 1 - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f9c3e4902f145..dd246cc3e8c4d 
100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -14,9 +14,9 @@ from sklearn.utils.testing import assert_false from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor -from sklearn.linear_model import BayesianRidge, ARDRegression +from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -73,8 +73,8 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) - chained_imputer = ChainedImputer(initial_strategy=strategy) - X_imputed = chained_imputer.fit_transform(X) + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2) @@ -508,46 +508,31 @@ def test_imputation_copy(): # made, even if copy=False. -def test_chained_imputer_rank_one(): - rng = np.random.RandomState(0) - d = 100 - A = rng.rand(d, 1) - B = rng.rand(1, d) - X = np.dot(A, B) - nan_mask = rng.rand(d, d) < 0.5 - X_missing = X.copy() - X_missing[nan_mask] = np.nan - - imputer = ChainedImputer(n_imputations=5, - n_burn_in=5, - verbose=True, - random_state=rng) - X_filled = imputer.fit_transform(X_missing) - assert_allclose(X_filled, X, atol=0.001) - - @pytest.mark.parametrize( "imputation_order", ['random', 'roman', 'ascending', 'descending', 'arabic'] ) -def test_chained_imputer_imputation_order(imputation_order): +def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - X[:, 0] = 1 # this column should not be discarded by ChainedImputer - - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - n_nearest_features=5, - min_value=0, - max_value=1, - verbose=False, - imputation_order=imputation_order, - random_state=rng) + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + n_iter = 2 + imputer = IterativeImputer(missing_values=0, + n_iter=n_iter, + n_nearest_features=5, + min_value=0, + max_value=1, + verbose=False, + imputation_order=imputation_order, + random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_ + if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) elif imputation_order == 'arabic': @@ -557,25 +542,24 @@ def test_chained_imputer_imputation_order(imputation_order): ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: - assert len(ordered_idx) == 2 * (d - 1) + assert len(ordered_idx) == n_iter * (d - 1) @pytest.mark.parametrize( "predictor", - [DummyRegressor(), BayesianRidge(), ARDRegression()] + [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] ) -def test_chained_imputer_predictors(predictor): +def test_iterative_imputer_predictors(predictor): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - predictor=predictor, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + predictor=predictor, + 
random_state=rng) imputer.fit_transform(X) # check that types are correct for predictors @@ -588,19 +572,18 @@ def test_chained_imputer_predictors(predictor): assert len(set(hashes)) == len(hashes) -def test_chained_imputer_clip(): +def test_iterative_imputer_clip(): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - min_value=0.1, - max_value=0.2, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + min_value=0.1, + max_value=0.2, + random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) @@ -612,7 +595,7 @@ def test_chained_imputer_clip(): "strategy", ["mean", "median", "most_frequent"] ) -def test_chained_imputer_missing_at_transform(strategy): +def test_iterative_imputer_missing_at_transform(strategy): rng = np.random.RandomState(0) n = 100 d = 10 @@ -622,11 +605,10 @@ def test_chained_imputer_missing_at_transform(strategy): X_train[:, 0] = 1 # definitely no missing values in 0th column X_test[0, 0] = 0 # definitely missing value in 0th column - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - initial_strategy=strategy, - random_state=rng).fit(X_train) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) @@ -636,17 +618,19 @@ def test_chained_imputer_missing_at_transform(strategy): initial_imputer.transform(X_test)[:, 0]) -def test_chained_imputer_transform_stochasticity(): - rng = np.random.RandomState(0) +def test_iterative_imputer_transform_stochasticity(): + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, - random_state=rng).toarray() + random_state=rng1).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - random_state=rng) + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=True, + random_state=rng1) imputer.fit(X) X_fitted_1 = imputer.transform(X) @@ -655,13 +639,39 @@ def test_chained_imputer_transform_stochasticity(): # sufficient to assert that the means are not the same assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) - -def test_chained_imputer_no_missing(): + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng1) + + imputer2 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng2) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert np.all(X_fitted_1a == X_fitted_1b) + assert np.all(X_fitted_1a == X_fitted_2) + + +def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan - m1 = ChainedImputer(n_imputations=10, random_state=rng) - m2 = ChainedImputer(n_imputations=10, random_state=rng) + m1 = IterativeImputer(n_iter=10, random_state=rng) + m2 
= IterativeImputer(n_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely @@ -670,11 +680,28 @@ def test_chained_imputer_no_missing(): assert_allclose(pred1, pred2) +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(n_iter=5, + verbose=1, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.01) + + @pytest.mark.parametrize( "rank", [3, 5] ) -def test_chained_imputer_transform_recovery(rank): +def test_iterative_imputer_transform_recovery(rank): rng = np.random.RandomState(0) n = 100 d = 100 @@ -692,15 +719,14 @@ def test_chained_imputer_transform_recovery(rank): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=10, - n_burn_in=10, - verbose=True, - random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=1, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) - assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1) + assert_allclose(X_test_filled, X_test_est, atol=0.1) -def test_chained_imputer_additive_matrix(): +def test_iterative_imputer_additive_matrix(): rng = np.random.RandomState(0) n = 100 d = 10 @@ -721,14 +747,20 @@ def test_chained_imputer_additive_matrix(): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=25, - n_burn_in=10, - verbose=True, - random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=2, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) assert_allclose(X_test_filled, X_test_est, atol=0.01) +def test_iterative_imputer_error_param(): + rng = np.random.RandomState(42) + X = rng.randn(100, 2) + imputer = IterativeImputer(n_iter=-1) + with pytest.raises(ValueError, match='should be a positive integer'): + imputer.fit_transform(X) + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -845,7 +877,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer, ChainedImputer]) + [SimpleImputer, IterativeImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 931e50d920402..a59c1b8cd6e6b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -77,7 +77,8 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer', 'MissingIndicator', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'IterativeImputer', + 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer']
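
For reference, a minimal sketch of the inductive fit/transform workflow that the updated ``IterativeImputer`` docstring describes: the imputer is fit once, storing each feature's trained predictor in ``imputation_sequence_``, and is then reused to complete new data without refitting. The toy matrices are illustrative only:

    import numpy as np
    from sklearn.impute import IterativeImputer

    X_train = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
    X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]

    # fit runs n_iter round-robin rounds over the features that have missing
    # values; transform replays the stored predictors in the same order
    imp = IterativeImputer(n_iter=10, random_state=0)
    imp.fit(X_train)
    print(imp.transform(X_test))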
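
The ``predictor`` parameter accepts any regressor; ``return_std`` support in its ``predict`` method is only needed when ``sample_posterior=True``. A sketch using ``ARDRegression``, one of the regressors exercised in the new tests:

    from sklearn.impute import IterativeImputer
    from sklearn.linear_model import ARDRegression

    # with sample_posterior=False (the default) the regressor's plain
    # predictions are used as the imputed values
    imp = IterativeImputer(predictor=ARDRegression(), n_iter=5, random_state=0)
    X = [[1, 2], [float('nan'), 3], [7, float('nan')], [4, 8]]
    print(imp.fit_transform(X))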
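
Per the updated narrative docs, multiple imputation can be obtained by applying the imputer repeatedly with ``sample_posterior=True`` and different random seeds; ``m`` and the toy matrix below are arbitrary choices for illustration:

    import numpy as np
    from sklearn.impute import IterativeImputer

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan], [4.0, 8.0]])

    # m completed copies of X; each run samples the imputed values from the
    # Gaussian predictive posterior of the per-feature BayesianRidge predictor
    m = 5
    imputations = [
        IterativeImputer(sample_posterior=True, n_iter=10,
                         random_state=seed).fit_transform(X)
        for seed in range(m)
    ]
    # downstream analyses can then be run on each completed copy to gauge
    # how sensitive the results are to the uncertainty in the missing values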