diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2d3174a9dcc05..ab0f473be4083 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -656,7 +656,7 @@ Kernels: :template: class.rst impute.SimpleImputer - impute.ChainedImputer + impute.IterativeImputer impute.MissingIndicator .. _kernel_approximation_ref: diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 268ce1c3ede19..8bb3ad8bf940b 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -24,7 +24,7 @@ One type of imputation algorithm is univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the -missing values (e.g. :class:`impute.ChainedImputer`). +missing values (e.g. :class:`impute.IterativeImputer`). .. _single_imputer: @@ -87,37 +87,37 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] -.. _chained_imputer: +.. _iterative_imputer: Multivariate feature imputation =============================== -A more sophisticated approach is to use the :class:`ChainedImputer` class, which -implements the imputation technique from MICE (Multivariate Imputation by -Chained Equations). MICE models each feature with missing values as a function of -other features, and uses that estimate for imputation. It does so in a round-robin -fashion: at each step, a feature column is designated as output `y` and the other -feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`. -Then, the regressor is used to predict the unknown values of `y`. This is repeated -for each feature in a chained fashion, and then is done for a number of imputation -rounds. Here is an example snippet:: +A more sophisticated approach is to use the :class:`IterativeImputer` class, +which models each feature with missing values as a function of other features, +and uses that estimate for imputation. It does so in an iterated round-robin +fashion: at each step, a feature column is designated as output ``y`` and the +other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, +y)`` for known ``y``. Then, the regressor is used to predict the missing values +of ``y``. This is done for each feature in an iterative fashion, and then is +repeated for ``n_iter`` imputation rounds. The results of the final imputation +round are returned. >>> import numpy as np - >>> from sklearn.impute import ChainedImputer - >>> imp = ChainedImputer(n_imputations=10, random_state=0) + >>> from sklearn.impute import IterativeImputer + >>> imp = IterativeImputer(n_iter=10, random_state=0) >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) - ChainedImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_burn_in=10, - n_imputations=10, n_nearest_features=None, predictor=None, - random_state=0, verbose=False) + IterativeImputer(imputation_order='ascending', initial_strategy='mean', + max_value=None, min_value=None, missing_values=nan, n_iter=10, + n_nearest_features=None, predictor=None, random_state=0, + sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 4.] - [13. 6.]] + [ 6. 3.] + [24. 
6.]] -Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. @@ -127,21 +127,40 @@ Multiple vs. Single Imputation ============================== In the statistics community, it is common practice to perform multiple imputations, -generating, for example, 10 separate imputations for a single feature matrix. -Each of these 10 imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The 10 final -analysis results (e.g. held-out validation error) allow the data scientist to -obtain understanding of the uncertainty inherent in the missing values. The above -practice is called multiple imputation. As implemented, the :class:`ChainedImputer` -class generates a single (averaged) imputation for each missing value because this -is the most common use case for machine learning applications. However, it can also be used -for multiple imputations by applying it repeatedly to the same dataset with different -random seeds with the ``n_imputations`` parameter set to 1. - -Note that a call to the ``transform`` method of :class:`ChainedImputer` is not +generating, for example, ``m`` separate imputations for a single feature matrix. +Each of these ``m`` imputations is then put through the subsequent analysis pipeline +(e.g. feature engineering, clustering, regression, classification). The ``m`` final +analysis results (e.g. held-out validation errors) allow the data scientist +to obtain understanding of how analytic results may differ as a consequence +of the inherent uncertainty caused by the missing values. The above practice +is called multiple imputation. + +Our implementation of :class:`IterativeImputer` was inspired by the R MICE +package (Multivariate Imputation by Chained Equations) [1]_, but differs from +it by returning a single imputation instead of multiple imputations. However, +:class:`IterativeImputer` can also be used for multiple imputations by applying +it repeatedly to the same dataset with different random seeds when +``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple +vs. single imputations. + +It is still an open problem as to how useful single vs. multiple imputation is in +the context of prediction and classification when the user is not interested in +measuring uncertainty due to missing values. + +Note that a call to the ``transform`` method of :class:`IterativeImputer` is not allowed to change the number of samples. Therefore multiple imputations cannot be achieved by a single call to ``transform``. +References +========== + +.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate + Imputation by Chained Equations in R". Journal of Statistical Software 45: + 1-67. + +.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis + with Missing Data". John Wiley & Sons, Inc., New York, NY, USA. + .. _missing_indicator: Marking imputed values diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2c010e5b1be59..2159e39dc126d 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -43,7 +43,7 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.impute` ..................... 
-- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 3ab1cfff95576..43d7ddfc497f3 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -4,11 +4,11 @@ ==================================================== Missing values can be replaced by the mean, the median or the most frequent -value using the basic :func:`sklearn.impute.SimpleImputer`. +value using the basic :class:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). -Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +Another option is the :class:`sklearn.impute.IterativeImputer`. This uses round-robin linear regression, treating every variable as an output in turn. The version implemented assumes Gaussian (output) variables. If your features are obviously non-Normal, consider transforming them to look more @@ -26,7 +26,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator +from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -73,18 +73,18 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) - # Estimate the score after chained imputation of the missing values + # Estimate the score after iterative imputation of the missing values estimator = make_pipeline( - make_union(ChainedImputer(missing_values=0, random_state=0), + make_union(IterativeImputer(missing_values=0, random_state=0), MissingIndicator(missing_values=0)), RandomForestRegressor(random_state=0, n_estimators=100)) - chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') + iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), - (chained_impute_scores.mean(), chained_impute_scores.std())) + (iterative_impute_scores.mean(), iterative_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -101,7 +101,7 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', 'Mean Imputation', - 'Chained Imputation'] + 'Multivariate Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index 89fb33a4f9034..3035040c1179a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -37,7 +37,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'ChainedImputer', + 'IterativeImputer', ] @@ -149,6 +149,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. 
+ Examples -------- >>> import numpy as np @@ -420,14 +424,13 @@ def transform(self, X): return X -class ChainedImputer(BaseEstimator, TransformerMixin): - """Chained imputer transformer to impute missing values. +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. - Basic implementation of chained imputer from MICE (Multivariate - Imputations by Chained Equations) package from R. This version assumes all - of the features are Gaussian. + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -449,24 +452,34 @@ class ChainedImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_imputations : int, optional (default=100) - Number of chained imputation rounds to perform, the results of which - will be used in the final average. + n_iter : int, optional (default=10) + Number of imputation rounds to perform before returning the imputations + computed during the final round. A round is a single imputation of each + feature with missing values. - n_burn_in : int, optional (default=10) - Number of initial imputation rounds to perform the results of which - will not be returned. - - predictor : estimator object, default=BayesianRidge() + predictor : estimator object, default=RidgeCV() or BayesianRidge() The predictor to use at each step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. Also, if + ``sample_posterior=True`` the default predictor will be + :class:`sklearn.linear_model.BayesianRidge` and + :class:`sklearn.linear_model.RidgeCV` otherwise. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted predictor for each imputation. Predictor must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - the each feature column. Nearness between features is measured using + each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). Can provide significant speed-up when the number - of features is huge. If ``None``, all features will be used. + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -487,37 +500,43 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. 
If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by ``np.random``. + The seed of the pseudo random number generator to use. Randomizes + selection of predictor features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. Attributes ---------- - initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' - The imputer used to initialize the missing values. + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. + the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. Notes ----- - The R version of MICE does not have inductive functionality, i.e. first - fitting on ``X_train`` and then transforming any ``X_test`` without - additional fitting. We do this by storing each feature's predictor during - the round-robin ``fit`` phase, and predicting without refitting (in order) - during the ``transform`` phase. + To support imputation in inductive mode we store each feature's predictor + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values in transform which did not have any missing - values in fit will be imputed with the initial imputation method only. + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. References ---------- @@ -525,14 +544,19 @@ class ChainedImputer(BaseEstimator, TransformerMixin): Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_imputations=100, - n_burn_in=10, + n_iter=10, predictor=None, + sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -542,9 +566,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_imputations = n_imputations - self.n_burn_in = n_burn_in + self.n_iter = n_iter self.predictor = predictor + self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -582,7 +606,8 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. 
+ If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. fit_mode : boolean, default=True @@ -621,12 +646,15 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - imputed_values[good_sigmas] = self.random_state_.normal( - loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + if self.sample_posterior: + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + else: + imputed_values = predictor.predict(X_test) # clip the values imputed_values = np.clip(imputed_values, @@ -822,44 +850,51 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) + if self.n_iter < 0: + raise ValueError( + "'n_iter' should be a positive integer. Got {} instead." + .format(self.n_iter)) + if self.predictor is None: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + if self.sample_posterior: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + from .linear_model import RidgeCV + # including a very small alpha to approximate OLS + self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) else: self._predictor = clone(self.predictor) + if hasattr(self._predictor, 'random_state'): + self._predictor.random_state = self.random_state_ + self._min_value = np.nan if self.min_value is None else self.min_value self._max_value = np.nan if self.max_value is None else self.max_value self.initial_imputer_ = None - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. 
# see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(X_filled) + abs_corr_mat = self._get_abs_corr_mat(Xt) # impute data - n_rounds = self.n_burn_in + self.n_imputations - n_samples, n_features = X_filled.shape - Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + n_samples, n_features = Xt.shape self.imputation_sequence_ = [] if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(n_rounds): + for i_rnd in range(self.n_iter): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -867,22 +902,19 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - X_filled, predictor = self._impute_one_feature( - X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + Xt, predictor = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 0: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -904,28 +936,20 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt - n_rounds = self.n_burn_in + self.n_imputations - n_imputations = len(self.imputation_sequence_) - imputations_per_round = n_imputations // n_rounds + imputations_per_round = len(self.imputation_sequence_) // self.n_iter i_rnd = 0 - Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - X_filled, _ = self._impute_one_feature( - X_filled, + Xt, _ = self._impute_one_feature( + Xt, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, @@ -933,15 +957,12 @@ def transform(self, X): fit_mode=False ) if not (it + 1) % imputations_per_round: - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 1: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) i_rnd += 1 - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f9c3e4902f145..dd246cc3e8c4d 
100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -14,9 +14,9 @@ from sklearn.utils.testing import assert_false from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor -from sklearn.linear_model import BayesianRidge, ARDRegression +from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -73,8 +73,8 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) - chained_imputer = ChainedImputer(initial_strategy=strategy) - X_imputed = chained_imputer.fit_transform(X) + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2) @@ -508,46 +508,31 @@ def test_imputation_copy(): # made, even if copy=False. -def test_chained_imputer_rank_one(): - rng = np.random.RandomState(0) - d = 100 - A = rng.rand(d, 1) - B = rng.rand(1, d) - X = np.dot(A, B) - nan_mask = rng.rand(d, d) < 0.5 - X_missing = X.copy() - X_missing[nan_mask] = np.nan - - imputer = ChainedImputer(n_imputations=5, - n_burn_in=5, - verbose=True, - random_state=rng) - X_filled = imputer.fit_transform(X_missing) - assert_allclose(X_filled, X, atol=0.001) - - @pytest.mark.parametrize( "imputation_order", ['random', 'roman', 'ascending', 'descending', 'arabic'] ) -def test_chained_imputer_imputation_order(imputation_order): +def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - X[:, 0] = 1 # this column should not be discarded by ChainedImputer - - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - n_nearest_features=5, - min_value=0, - max_value=1, - verbose=False, - imputation_order=imputation_order, - random_state=rng) + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + n_iter = 2 + imputer = IterativeImputer(missing_values=0, + n_iter=n_iter, + n_nearest_features=5, + min_value=0, + max_value=1, + verbose=False, + imputation_order=imputation_order, + random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_ + if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) elif imputation_order == 'arabic': @@ -557,25 +542,24 @@ def test_chained_imputer_imputation_order(imputation_order): ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: - assert len(ordered_idx) == 2 * (d - 1) + assert len(ordered_idx) == n_iter * (d - 1) @pytest.mark.parametrize( "predictor", - [DummyRegressor(), BayesianRidge(), ARDRegression()] + [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] ) -def test_chained_imputer_predictors(predictor): +def test_iterative_imputer_predictors(predictor): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - predictor=predictor, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + predictor=predictor, + 
random_state=rng) imputer.fit_transform(X) # check that types are correct for predictors @@ -588,19 +572,18 @@ def test_chained_imputer_predictors(predictor): assert len(set(hashes)) == len(hashes) -def test_chained_imputer_clip(): +def test_iterative_imputer_clip(): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - min_value=0.1, - max_value=0.2, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + min_value=0.1, + max_value=0.2, + random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) @@ -612,7 +595,7 @@ def test_chained_imputer_clip(): "strategy", ["mean", "median", "most_frequent"] ) -def test_chained_imputer_missing_at_transform(strategy): +def test_iterative_imputer_missing_at_transform(strategy): rng = np.random.RandomState(0) n = 100 d = 10 @@ -622,11 +605,10 @@ def test_chained_imputer_missing_at_transform(strategy): X_train[:, 0] = 1 # definitely no missing values in 0th column X_test[0, 0] = 0 # definitely missing value in 0th column - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - initial_strategy=strategy, - random_state=rng).fit(X_train) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) @@ -636,17 +618,19 @@ def test_chained_imputer_missing_at_transform(strategy): initial_imputer.transform(X_test)[:, 0]) -def test_chained_imputer_transform_stochasticity(): - rng = np.random.RandomState(0) +def test_iterative_imputer_transform_stochasticity(): + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, - random_state=rng).toarray() + random_state=rng1).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - random_state=rng) + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=True, + random_state=rng1) imputer.fit(X) X_fitted_1 = imputer.transform(X) @@ -655,13 +639,39 @@ def test_chained_imputer_transform_stochasticity(): # sufficient to assert that the means are not the same assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) - -def test_chained_imputer_no_missing(): + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng1) + + imputer2 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng2) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert np.all(X_fitted_1a == X_fitted_1b) + assert np.all(X_fitted_1a == X_fitted_2) + + +def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan - m1 = ChainedImputer(n_imputations=10, random_state=rng) - m2 = ChainedImputer(n_imputations=10, random_state=rng) + m1 = IterativeImputer(n_iter=10, random_state=rng) + m2 
= IterativeImputer(n_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely @@ -670,11 +680,28 @@ def test_chained_imputer_no_missing(): assert_allclose(pred1, pred2) +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(n_iter=5, + verbose=1, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.01) + + @pytest.mark.parametrize( "rank", [3, 5] ) -def test_chained_imputer_transform_recovery(rank): +def test_iterative_imputer_transform_recovery(rank): rng = np.random.RandomState(0) n = 100 d = 100 @@ -692,15 +719,14 @@ def test_chained_imputer_transform_recovery(rank): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=10, - n_burn_in=10, - verbose=True, - random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=1, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) - assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1) + assert_allclose(X_test_filled, X_test_est, atol=0.1) -def test_chained_imputer_additive_matrix(): +def test_iterative_imputer_additive_matrix(): rng = np.random.RandomState(0) n = 100 d = 10 @@ -721,14 +747,20 @@ def test_chained_imputer_additive_matrix(): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=25, - n_burn_in=10, - verbose=True, - random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=2, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) assert_allclose(X_test_filled, X_test_est, atol=0.01) +def test_iterative_imputer_error_param(): + rng = np.random.RandomState(42) + X = rng.randn(100, 2) + imputer = IterativeImputer(n_iter=-1) + with pytest.raises(ValueError, match='should be a positive integer'): + imputer.fit_transform(X) + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -845,7 +877,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer, ChainedImputer]) + [SimpleImputer, IterativeImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 931e50d920402..a59c1b8cd6e6b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -77,7 +77,8 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer', 'MissingIndicator', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'IterativeImputer', + 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer']
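
For reference, a minimal sketch of the inductive fit/transform workflow that the updated ``IterativeImputer`` docstring describes: the imputer is fit once, storing each feature's trained predictor in ``imputation_sequence_``, and is then reused to complete new data without refitting. The toy matrices are illustrative only:

    import numpy as np
    from sklearn.impute import IterativeImputer

    X_train = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
    X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]

    # fit runs n_iter round-robin rounds over the features that have missing
    # values; transform replays the stored predictors in the same order
    imp = IterativeImputer(n_iter=10, random_state=0)
    imp.fit(X_train)
    print(imp.transform(X_test))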
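
The ``predictor`` parameter accepts any regressor; ``return_std`` support in its ``predict`` method is only needed when ``sample_posterior=True``. A sketch using ``ARDRegression``, one of the regressors exercised in the new tests:

    from sklearn.impute import IterativeImputer
    from sklearn.linear_model import ARDRegression

    # with sample_posterior=False (the default) the regressor's plain
    # predictions are used as the imputed values
    imp = IterativeImputer(predictor=ARDRegression(), n_iter=5, random_state=0)
    X = [[1, 2], [float('nan'), 3], [7, float('nan')], [4, 8]]
    print(imp.fit_transform(X))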
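
Per the updated narrative docs, multiple imputation can be obtained by applying the imputer repeatedly with ``sample_posterior=True`` and different random seeds; ``m`` and the toy matrix below are arbitrary choices for illustration:

    import numpy as np
    from sklearn.impute import IterativeImputer

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan], [4.0, 8.0]])

    # m completed copies of X; each run samples the imputed values from the
    # Gaussian predictive posterior of the per-feature BayesianRidge predictor
    m = 5
    imputations = [
        IterativeImputer(sample_posterior=True, n_iter=10,
                         random_state=seed).fit_transform(X)
        for seed in range(m)
    ]
    # downstream analyses can then be run on each completed copy to gauge
    # how sensitive the results are to the uncertainty in the missing values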