diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 88201ba8f2ad6..e8fc80644c002 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -655,8 +655,9 @@ Kernels: :template: class.rst impute.SimpleImputer + impute.IterativeImputer impute.MissingIndicator - + .. _kernel_approximation_ref: :mod:`sklearn.kernel_approximation` Kernel Approximation diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 933685f8bfa6f..6de5df8b12729 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -9,12 +9,28 @@ Imputation of missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an -array are numerical, and that all have and hold meaning. A basic strategy to use -incomplete datasets is to discard entire rows and/or columns containing missing -values. However, this comes at the price of losing data which may be valuable -(even though incomplete). A better strategy is to impute the missing values, -i.e., to infer them from the known part of the data. See the :ref:`glossary` -entry on imputation. +array are numerical, and that all have and hold meaning. A basic strategy to +use incomplete datasets is to discard entire rows and/or columns containing +missing values. However, this comes at the price of losing data which may be +valuable (even though incomplete). A better strategy is to impute the missing +values, i.e., to infer them from the known part of the data. See the +:ref:`glossary` entry on imputation. + + +Univariate vs. Multivariate Imputation +====================================== + +One type of imputation algorithm is univariate, which imputes values in the +i-th feature dimension using only non-missing values in that feature dimension +(e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation +algorithms use the entire set of available feature dimensions to estimate the +missing values (e.g. :class:`impute.IterativeImputer`). + + +.. _single_imputer: + +Univariate feature imputation +============================= The :class:`SimpleImputer` class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using @@ -50,9 +66,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.] [7. 6.]] -Note that this format is not meant to be used to implicitly store missing values -in the matrix because it would densify it at transform time. Missing values encoded -by 0 must be used with dense input. +Note that this format is not meant to be used to implicitly store missing +values in the matrix because it would densify it at transform time. Missing +values encoded by 0 must be used with dense input. The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``'most_frequent'`` or @@ -71,9 +87,92 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] +.. _iterative_imputer: + + +Multivariate feature imputation +=============================== + +A more sophisticated approach is to use the :class:`IterativeImputer` class, +which models each feature with missing values as a function of other features, +and uses that estimate for imputation. 
It does so in an iterated round-robin +fashion: at each step, a feature column is designated as output ``y`` and the +other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, +y)`` for known ``y``. Then, the regressor is used to predict the missing values +of ``y``. This is done for each feature in an iterative fashion, and then is +repeated for ``max_iter`` imputation rounds. The results of the final +imputation round are returned. + + >>> import numpy as np + >>> from sklearn.impute import IterativeImputer + >>> imp = IterativeImputer(max_iter=10, random_state=0) + >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + IterativeImputer(estimator=None, imputation_order='ascending', + initial_strategy='mean', max_iter=10, max_value=None, + min_value=None, missing_values=nan, n_nearest_features=None, + random_state=0, sample_posterior=False, tol=0.001, verbose=0) + >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> # the model learns that the second feature is double the first + >>> print(np.round(imp.transform(X_test))) + [[ 1. 2.] + [ 6. 12.] + [ 3. 6.]] + +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a +Pipeline as a way to build a composite estimator that supports imputation. +See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + +Flexibility of IterativeImputer +------------------------------- + +There are many well-established imputation packages in the R data science +ecosystem: Amelia, mi, mice, missForest, etc. missForest is popular, and turns +out to be a particular instance of different sequential imputation algorithms +that can all be implemented with :class:`IterativeImputer` by passing in +different regressors to be used for predicting missing feature values. In the +case of missForest, this regressor is a Random Forest. +See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`. + + +.. _multiple_imputation: + +Multiple vs. Single Imputation +------------------------------ + +In the statistics community, it is common practice to perform multiple +imputations, generating, for example, ``m`` separate imputations for a single +feature matrix. Each of these ``m`` imputations is then put through the +subsequent analysis pipeline (e.g. feature engineering, clustering, regression, +classification). The ``m`` final analysis results (e.g. held-out validation +errors) allow the data scientist to obtain understanding of how analytic +results may differ as a consequence of the inherent uncertainty caused by the +missing values. The above practice is called multiple imputation. + +Our implementation of :class:`IterativeImputer` was inspired by the R MICE +package (Multivariate Imputation by Chained Equations) [1]_, but differs from +it by returning a single imputation instead of multiple imputations. However, +:class:`IterativeImputer` can also be used for multiple imputations by applying +it repeatedly to the same dataset with different random seeds when +``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple +vs. single imputations. + +It is still an open problem as to how useful single vs. multiple imputation is +in the context of prediction and classification when the user is not +interested in measuring uncertainty due to missing values. + +Note that a call to the ``transform`` method of :class:`IterativeImputer` is +not allowed to change the number of samples. 
Therefore multiple imputations
+cannot be achieved by a single call to ``transform``.
+
+References
+==========
+
+.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate
+    Imputation by Chained Equations in R". Journal of Statistical Software 45:
+    1-67.

-:class:`SimpleImputer` can be used in a Pipeline as a way to build a composite
-estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
+.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis
+    with Missing Data". John Wiley & Sons, Inc., New York, NY, USA.

 .. _missing_indicator:

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 3bbd9f3189bd4..4d5abf0083753 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -115,6 +115,15 @@ Support for Python 3.4 and below has been officially dropped.
 - |API| Deprecated :mod:`externals.six` since we have dropped support for
   Python 2.7. :issue:`12916` by :user:`Hanmin Qin `.

+:mod:`sklearn.impute`
+.....................
+
+- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy
+  for imputing missing values by modeling each feature with missing values as a
+  function of other features in a round-robin fashion. :issue:`8478` and
+  :issue:`12177` by :user:`Sergey Feldman ` and :user:`Ben Lawson
+  `.
+
 :mod:`sklearn.linear_model`
 ...........................

diff --git a/examples/impute/README.txt b/examples/impute/README.txt
new file mode 100644
index 0000000000000..e42264caf9087
--- /dev/null
+++ b/examples/impute/README.txt
@@ -0,0 +1,6 @@
+.. _impute_examples:
+
+Missing Value Imputation
+------------------------
+
+Examples concerning the :mod:`sklearn.impute` module.
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
new file mode 100644
index 0000000000000..77a12e87a1e8a
--- /dev/null
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -0,0 +1,126 @@
+"""
+=========================================================
+Imputing missing values with variants of IterativeImputer
+=========================================================
+
+The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be
+used with a variety of estimators to do round-robin regression, treating every
+variable as an output in turn.
+
+In this example we compare some estimators for the purpose of missing feature
+imputation with :class:`sklearn.impute.IterativeImputer`::
+
+    :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression
+    :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression
+    :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
+    :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
+    imputation approaches
+
+Of particular interest is the ability of
+:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a
+popular imputation package for R. In this example, we have chosen to use
+:class:`sklearn.ensemble.ExtraTreesRegressor` instead of
+:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its
+increased speed.
+
+Note that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN
+imputation, which learns from samples with missing values by using a distance
+metric that accounts for missing values, rather than imputing them.
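+
+As a quick reference, the pattern used in the code below can be condensed to
+the following sketch (assuming the default ``IterativeImputer`` settings
+otherwise)::
+
+    from sklearn.ensemble import ExtraTreesRegressor
+    from sklearn.impute import IterativeImputer
+    from sklearn.linear_model import BayesianRidge
+    from sklearn.pipeline import make_pipeline
+
+    # round-robin imputation with a tree-based regressor (missForest-like),
+    # chained with the BayesianRidge regressor used for the final prediction
+    imputer_pipeline = make_pipeline(
+        IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10,
+                                                       random_state=0),
+                         random_state=0),
+        BayesianRidge()
+    )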
+ +The goal is to compare different estimators to see which one is best for the +:class:`sklearn.impute.IterativeImputer` when using a +:class:`sklearn.linear_model.BayesianRidge` estimator on the California housing +dataset with a single value randomly removed from each row. + +For this particular pattern of missing values we see that +:class:`sklearn.ensemble.ExtraTreesRegressor` and +:class:`sklearn.linear_model.BayesianRidge` give the best results. +""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.datasets import fetch_california_housing +from sklearn.impute import SimpleImputer +from sklearn.impute import IterativeImputer +from sklearn.linear_model import BayesianRidge +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.neighbors import KNeighborsRegressor +from sklearn.pipeline import make_pipeline +from sklearn.model_selection import cross_val_score + +N_SPLITS = 5 + +rng = np.random.RandomState(0) + +X_full, y_full = fetch_california_housing(return_X_y=True) +n_samples, n_features = X_full.shape + +# Estimate the score on the entire dataset, with no missing values +br_estimator = BayesianRidge() +score_full_data = pd.DataFrame( + cross_val_score( + br_estimator, X_full, y_full, scoring='neg_mean_squared_error', + cv=N_SPLITS + ), + columns=['Full Data'] +) + +# Add a single missing value to each row +X_missing = X_full.copy() +y_missing = y_full +missing_samples = np.arange(n_samples) +missing_features = rng.choice(n_features, n_samples, replace=True) +X_missing[missing_samples, missing_features] = np.nan + +# Estimate the score after imputation (mean and median strategies) +score_simple_imputer = pd.DataFrame() +for strategy in ('mean', 'median'): + estimator = make_pipeline( + SimpleImputer(missing_values=np.nan, strategy=strategy), + br_estimator + ) + score_simple_imputer[strategy] = cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +# Estimate the score after iterative imputation of the missing values +# with different estimators +estimators = [ + BayesianRidge(), + DecisionTreeRegressor(max_features='sqrt', random_state=0), + ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), + KNeighborsRegressor(n_neighbors=15) +] +score_iterative_imputer = pd.DataFrame() +for estimator in estimators: + estimator = make_pipeline( + IterativeImputer(random_state=0, estimator=estimator), + br_estimator + ) + score_iterative_imputer[estimator.__class__.__name__] = \ + cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +scores = pd.concat( + [score_full_data, score_simple_imputer, score_iterative_imputer], + keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 +) + +# plot boston results +fig, ax = plt.subplots(figsize=(13, 6)) +means = -scores.mean() +errors = scores.std() +means.plot.barh(xerr=errors, ax=ax) +ax.set_title('California Housing Regression with Different Imputation Methods') +ax.set_xlabel('MSE (smaller is better)') +ax.set_yticks(np.arange(means.shape[0])) +ax.set_yticklabels([" w/ ".join(label) for label in means.index.get_values()]) +plt.tight_layout(pad=1) +plt.show() diff --git a/examples/plot_missing_values.py b/examples/impute/plot_missing_values.py similarity index 62% rename from examples/plot_missing_values.py rename to examples/impute/plot_missing_values.py index 755943fb55bda..897b66aad246c 100644 --- 
a/examples/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -3,21 +3,23 @@ Imputing missing values before building an estimator ==================================================== -This example shows that imputing the missing values can give better -results than discarding the samples containing any missing value. -Imputing does not always improve the predictions, so please check via -cross-validation. Sometimes dropping rows or using marker values is -more effective. - Missing values can be replaced by the mean, the median or the most frequent -value using the basic :func:`sklearn.impute.SimpleImputer`. +value using the basic :class:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). +Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +round-robin linear regression, treating every variable as an output in +turn. The version implemented assumes Gaussian (output) variables. If your +features are obviously non-Normal, consider transforming them to look more +Normal so as to potentially improve performance. + In addition of using an imputing method, we can also keep an indication of the missing information using :func:`sklearn.impute.MissingIndicator` which might carry some information. """ +print(__doc__) + import numpy as np import matplotlib.pyplot as plt @@ -25,11 +27,24 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, MissingIndicator +from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) +N_SPLITS = 5 +REGRESSOR = RandomForestRegressor(random_state=0, n_estimators=100) + + +def get_scores_for_imputer(imputer, X_missing, y_missing): + estimator = make_pipeline( + make_union(imputer, MissingIndicator(missing_values=0)), + REGRESSOR) + impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error', + cv=N_SPLITS) + return impute_scores + def get_results(dataset): X_full, y_full = dataset.data, dataset.target @@ -37,9 +52,9 @@ def get_results(dataset): n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - full_scores = cross_val_score(estimator, X_full, y_full, - scoring='neg_mean_squared_error', cv=5) + full_scores = cross_val_score(REGRESSOR, X_full, y_full, + scoring='neg_mean_squared_error', + cv=N_SPLITS) # Add missing values in 75% of the lines missing_rate = 0.75 @@ -50,32 +65,32 @@ def get_results(dataset): dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) - - # Estimate the score after replacing missing values by 0 X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - zero_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + + # Estimate the score after replacing missing values by 0 + imputer = SimpleImputer(missing_values=0, + strategy='constant', + fill_value=0) + zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after imputation 
(mean strategy) of the missing values - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - estimator = make_pipeline( - make_union(SimpleImputer(missing_values=0, strategy="mean"), - MissingIndicator(missing_values=0)), - RandomForestRegressor(random_state=0, n_estimators=100)) - mean_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + imputer = SimpleImputer(missing_values=0, strategy="mean") + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + # Estimate the score after iterative imputation of the missing values + imputer = IterativeImputer(missing_values=0, + random_state=0, + n_nearest_features=5) + iterative_impute_scores = get_scores_for_imputer(imputer, + X_missing, + y_missing) return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std())) + (mean_impute_scores.mean(), mean_impute_scores.std()), + (iterative_impute_scores.mean(), iterative_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -91,7 +106,8 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', - 'Mean Imputation'] + 'Mean Imputation', + 'Multivariate Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index fdafa6ad6e198..3bb0bdd9eff15 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -3,16 +3,25 @@ # Sergey Feldman # License: BSD 3 clause +from __future__ import division + import warnings import numbers +from time import time +from distutils.version import LooseVersion import numpy as np import numpy.ma as ma +import scipy from scipy import sparse from scipy import stats +from collections import namedtuple from .base import BaseEstimator, TransformerMixin -from .utils import check_array +from .base import clone +from .exceptions import ConvergenceWarning +from .preprocessing import normalize +from .utils import check_array, check_random_state, safe_indexing from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES @@ -20,9 +29,14 @@ from .utils import is_scalar_nan +ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', + 'neighbor_feat_idx', + 'estimator']) + __all__ = [ 'MissingIndicator', 'SimpleImputer', + 'IterativeImputer', ] @@ -134,6 +148,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + Examples -------- >>> import numpy as np @@ -405,6 +423,618 @@ def transform(self, X): return X +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. + + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + + missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. 
All occurrences of + ``missing_values`` will be imputed. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted estimator for each imputation. Estimator must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. + + max_iter : int, optional (default=10) + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. The stopping criterion + is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, + where `X_t` is `X` at iteration `t. Note that early stopping is only + applied if ``sample_posterior=False``. + + tol : float, optional (default=1e-3) + Tolerance of the stopping condition. + + n_nearest_features : int, optional (default=None) + Number of other features to use to estimate the missing values of + each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. + + initial_strategy : str, optional (default="mean") + Which strategy to use to initialize the missing values. Same as the + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. + + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + + min_value : float, optional (default=None) + Minimum possible imputed value. Default of ``None`` will set minimum + to negative infinity. + + max_value : float, optional (default=None) + Maximum possible imputed value. Default of ``None`` will set maximum + to positive infinity. + + verbose : int, optional (default=0) + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use. Randomizes + selection of estimator features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. + + Attributes + ---------- + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where + ``feat_idx`` is the current feature to be imputed, + ``neighbor_feat_idx`` is the array of other features used to impute the + current feature, and ``estimator`` is the trained estimator used for + the imputation. Length is ``self.n_features_with_missing_ * + self.n_iter_``. 
+ + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + ``self.max_iter`` if early stopping criterion was reached. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. + + Notes + ----- + To support imputation in inductive mode we store each feature's estimator + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. + + Features which contain all missing values at ``fit`` are discarded upon + ``transform``. + + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ + """ + + def __init__(self, + estimator=None, + missing_values=np.nan, + sample_posterior=False, + max_iter=10, + tol=1e-3, + n_nearest_features=None, + initial_strategy="mean", + imputation_order='ascending', + min_value=None, + max_value=None, + verbose=0, + random_state=None): + + self.estimator = estimator + self.missing_values = missing_values + self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.imputation_order = imputation_order + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + + def _impute_one_feature(self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. The ``estimator`` must + support ``return_std=True`` in its ``predict`` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing ``feat_idx``. + + estimator : object + The estimator to use at this step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + If None, it will be cloned from self._estimator. + + fit_mode : boolean, default=True + Whether to fit and predict with the estimator or just predict. + + Returns + ------- + X_filled : ndarray + Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + + estimator : estimator with sklearn API + The fitted estimator used to impute + ``X_filled[missing_row_mask, feat_idx]``. 
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, estimator + + if estimator is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "estimator should be passed in.") + + if estimator is None: + estimator = clone(self._estimator) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + estimator.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[inrange_mask] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, estimator + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. 
+ """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. 
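+
+        The initial fill is produced by a :class:`SimpleImputer` configured
+        with ``self.initial_strategy``; features whose initial statistics are
+        undefined (i.e. entirely missing when the initial imputer was fit)
+        are dropped from the returned ``Xt`` and mask.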
+ + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + Returns + ------- + Xt : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + """ + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = check_array(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + + mask_missing_values = _get_mask(X, self.missing_values) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + valid_mask = np.flatnonzero(np.logical_not( + np.isnan(self.initial_imputer_.statistics_))) + Xt = X[:, valid_mask] + mask_missing_values = mask_missing_values[:, valid_mask] + + return Xt, X_filled, mask_missing_values + + def fit_transform(self, X, y=None): + """Fits the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + self.random_state_ = getattr(self, "random_state_", + check_random_state(self.random_state)) + + if self.max_iter < 0: + raise ValueError( + "'max_iter' should be a positive integer. Got {} instead." + .format(self.max_iter)) + + if self.tol < 0: + raise ValueError( + "'tol' should be a non-negative float. Got {} instead." + .format(self.tol) + ) + + if self.estimator is None: + from .linear_model import BayesianRidge + self._estimator = BayesianRidge() + else: + self._estimator = clone(self.estimator) + + self.imputation_sequence_ = [] + + if hasattr(self._estimator, 'random_state'): + self._estimator.random_state = self.random_state_ + + self._min_value = -np.inf if self.min_value is None else self.min_value + self._max_value = np.inf if self.max_value is None else self.max_value + + self.initial_imputer_ = None + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 + return Xt + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. 
+ # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) + + abs_corr_mat = self._get_abs_corr_mat(Xt) + + n_samples, n_features = Xt.shape + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): + if self.imputation_order == 'random': + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, + feat_idx, + abs_corr_mat) + Xt, estimator = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, + estimator=None, fit_mode=True) + estimator_triplet = ImputerTriplet(feat_idx, + neighbor_feat_idx, + estimator) + self.imputation_sequence_.append(estimator_triplet) + + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (self.n_iter_, self.max_iter, time() - start_t)) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, + axis=None) + if inf_norm < normalized_tol: + if self.verbose > 0: + print('[IterativeImputer] Early stopping criterion ' + 'reached.') + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn("[IterativeImputer] Early stopping criterion not" + " reached.", ConvergenceWarning) + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def transform(self, X): + """Imputes all missing values in X. + + Note that this is stochastic, and that if random_state is not fixed, + repeated calls, or permuted input, will yield different results. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self, 'initial_imputer_') + + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.n_iter_ == 0 or np.all(mask_missing_values): + return Xt + + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ + i_rnd = 0 + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for it, estimator_triplet in enumerate(self.imputation_sequence_): + Xt, _ = self._impute_one_feature( + Xt, + mask_missing_values, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, + fit_mode=False + ) + if not (it + 1) % imputations_per_round: + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, self.n_iter_, time() - start_t)) + i_rnd += 1 + + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def fit(self, X, y=None): + """Fits the imputer on X and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored + + Returns + ------- + self : object + Returns self. + """ + self.fit_transform(X) + return self + + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. 
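
The ``Notes`` section of ``IterativeImputer`` above describes its inductive
behaviour: the per-feature estimators fitted during ``fit``/``fit_transform``
are stored in ``imputation_sequence_`` and are replayed in order, without
refitting, by ``transform``. A minimal usage sketch of that behaviour (the toy
data below is illustrative only)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    X_train = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
    X_new = [[np.nan, 4], [5, np.nan]]

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(X_train)
    # one ImputerTriplet per (imputation round, feature with missing values)
    print(len(imp.imputation_sequence_))
    # transform() reuses the stored estimators (fit_mode=False), no refitting
    print(imp.transform(X_new))
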
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 058cce9a33834..dfa0134d5ab42 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1,7 +1,10 @@ +from __future__ import division + import pytest import numpy as np from scipy import sparse +from scipy.stats import kstest import io @@ -11,7 +14,9 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV from sklearn.pipeline import Pipeline from sklearn.pipeline import make_union from sklearn.model_selection import GridSearchCV @@ -69,6 +74,10 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + @pytest.mark.parametrize("strategy", ["const", 101, None]) def test_imputation_error_invalid_strategy(strategy): @@ -500,6 +509,409 @@ def test_imputation_copy(): # made, even if copy=False. +def test_iterative_imputer_zero_iters(): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + missing_flag = X == 0 + X[missing_flag] = np.nan + + imputer = IterativeImputer(max_iter=0) + X_imputed = imputer.fit_transform(X) + # with max_iter=0, only initial imputation is performed + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + # repeat but force n_iter_ to 0 + imputer = IterativeImputer(max_iter=5).fit(X) + # transformed should not be equal to initial imputation + assert not np.all(imputer.transform(X) == + imputer.initial_imputer_.transform(X)) + + imputer.n_iter_ = 0 + # now they should be equal as only initial imputation is done + assert_allclose(imputer.transform(X), + imputer.initial_imputer_.transform(X)) + + +def test_iterative_imputer_verbose(): + rng = np.random.RandomState(0) + + n = 100 + d = 3 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) + imputer.fit(X) + imputer.transform(X) + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) + imputer.fit(X) + imputer.transform(X) + + +def test_iterative_imputer_all_missing(): + n = 100 + d = 3 + X = np.zeros((n, d)) + imputer = IterativeImputer(missing_values=0, max_iter=1) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + +@pytest.mark.parametrize( + "imputation_order", + ['random', 'roman', 'ascending', 'descending', 'arabic'] +) +def test_iterative_imputer_imputation_order(imputation_order): + rng = np.random.RandomState(0) + n = 100 + d = 10 + max_iter = 2 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + imputer = IterativeImputer(missing_values=0, + max_iter=max_iter, + n_nearest_features=5, + sample_posterior=False, + min_value=0, + max_value=1, + verbose=1, + imputation_order=imputation_order, + random_state=rng) + imputer.fit_transform(X) + ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert (len(ordered_idx) // imputer.n_iter_ == + imputer.n_features_with_missing_) + + if imputation_order 
== 'roman': + assert np.all(ordered_idx[:d-1] == np.arange(1, d)) + elif imputation_order == 'arabic': + assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1)) + elif imputation_order == 'random': + ordered_idx_round_1 = ordered_idx[:d-1] + ordered_idx_round_2 = ordered_idx[d-1:] + assert ordered_idx_round_1 != ordered_idx_round_2 + elif 'ending' in imputation_order: + assert len(ordered_idx) == max_iter * (d - 1) + + +@pytest.mark.parametrize( + "estimator", + [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] +) +def test_iterative_imputer_estimators(estimator): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = IterativeImputer(missing_values=0, + max_iter=1, + estimator=estimator, + random_state=rng) + imputer.fit_transform(X) + + # check that types are correct for estimators + hashes = [] + for triplet in imputer.imputation_sequence_: + expected_type = (type(estimator) if estimator is not None + else type(BayesianRidge())) + assert isinstance(triplet.estimator, expected_type) + hashes.append(id(triplet.estimator)) + + # check that each estimator is unique + assert len(set(hashes)) == len(hashes) + + +def test_iterative_imputer_clip(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng).toarray() + + imputer = IterativeImputer(missing_values=0, + max_iter=1, + min_value=0.1, + max_value=0.2, + random_state=rng) + + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_clip_truncnorm(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 + + imputer = IterativeImputer(missing_values=0, + max_iter=2, + n_nearest_features=5, + sample_posterior=True, + min_value=0.1, + max_value=0.2, + verbose=1, + imputation_order='random', + random_state=rng) + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_truncated_normal_posterior(): + # test that the values that are imputed using `sample_posterior=True` + # with boundaries (`min_value` and `max_value` are not None) are drawn + # from a distribution that looks gaussian via the Kolmogorov Smirnov test. 
+ # note that starting from the wrong random seed will make this test fail + # because random sampling doesn't occur at all when the imputation + # is outside of the (min_value, max_value) range + pytest.importorskip("scipy", minversion="0.17.0") + rng = np.random.RandomState(42) + + X = rng.normal(size=(5, 5)) + X[0][0] = np.nan + + imputer = IterativeImputer(min_value=0, + max_value=0.5, + sample_posterior=True, + random_state=rng) + + imputer.fit_transform(X) + # generate multiple imputations for the single missing value + imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)]) + + assert all(imputations >= 0) + assert all(imputations <= 0.5) + + mu, sigma = imputations.mean(), imputations.std() + ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm') + if sigma == 0: + sigma += 1e-12 + ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm') + # we want to fail to reject null hypothesis + # null hypothesis: distributions are the same + assert ks_statistic < 0.2 or p_value > 0.1, \ + "The posterior does appear to be normal" + + +@pytest.mark.parametrize( + "strategy", + ["mean", "median", "most_frequent"] +) +def test_iterative_imputer_missing_at_transform(strategy): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X_train = rng.randint(low=0, high=3, size=(n, d)) + X_test = rng.randint(low=0, high=3, size=(n, d)) + + X_train[:, 0] = 1 # definitely no missing values in 0th column + X_test[0, 0] = 0 # definitely missing value in 0th column + + imputer = IterativeImputer(missing_values=0, + max_iter=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) + initial_imputer = SimpleImputer(missing_values=0, + strategy=strategy).fit(X_train) + + # if there were no missing values at time of fit, then imputer will + # only use the initial imputer for that feature at transform + assert np.all(imputer.transform(X_test)[:, 0] == + initial_imputer.transform(X_test)[:, 0]) + + +def test_iterative_imputer_transform_stochasticity(): + pytest.importorskip("scipy", minversion="0.17.0") + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng1).toarray() + + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer(missing_values=0, + max_iter=1, + sample_posterior=True, + random_state=rng1) + imputer.fit(X) + + X_fitted_1 = imputer.transform(X) + X_fitted_2 = imputer.transform(X) + + # sufficient to assert that the means are not the same + assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) + + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer(missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng1) + + imputer2 = IterativeImputer(missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng2) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert np.all(X_fitted_1a == X_fitted_1b) + assert np.all(X_fitted_1a == X_fitted_2) + + +def test_iterative_imputer_no_missing(): + rng = np.random.RandomState(0) + X = rng.rand(100, 100) + X[:, 0] = np.nan + m1 = IterativeImputer(max_iter=10, random_state=rng) + m2 = 
IterativeImputer(max_iter=10, random_state=rng) + pred1 = m1.fit(X).transform(X) + pred2 = m2.fit_transform(X) + # should exclude the first column entirely + assert_allclose(X[:, 1:], pred1) + # fit and fit_transform should both be identical + assert_allclose(pred1, pred2) + + +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=5, + verbose=1, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.01) + + +@pytest.mark.parametrize( + "rank", + [3, 5] +) +def test_iterative_imputer_transform_recovery(rank): + rng = np.random.RandomState(0) + n = 100 + d = 100 + A = rng.rand(n, rank) + B = rng.rand(rank, d) + X_filled = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data in half + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer(max_iter=10, + verbose=1, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, atol=0.1) + + +def test_iterative_imputer_additive_matrix(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + A = rng.randn(n, d) + B = rng.randn(n, d) + X_filled = np.zeros(A.shape) + for i in range(d): + for j in range(d): + X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2 + # a quarter is randomly missing + nan_mask = rng.rand(n, d) < 0.25 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer(max_iter=10, + verbose=1, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01) + + +@pytest.mark.parametrize("max_iter, tol, error_type, warning", [ + (-1, 1e-3, ValueError, 'should be a positive integer'), + (1, -1e-3, ValueError, 'should be a non-negative float') +]) +def test_iterative_imputer_error_param(max_iter, tol, error_type, warning): + X = np.zeros((100, 2)) + imputer = IterativeImputer(max_iter=max_iter, tol=tol) + with pytest.raises(error_type, match=warning): + imputer.fit_transform(X) + + +def test_iterative_imputer_early_stopping(): + rng = np.random.RandomState(0) + n = 50 + d = 5 + A = rng.rand(n, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=100, + tol=1e-3, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_100 = imputer.fit_transform(X_missing) + assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ + + imputer = IterativeImputer(max_iter=imputer.n_iter_, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_early = imputer.fit_transform(X_missing) + assert_allclose(X_filled_100, X_filled_early, atol=1e-7) + + imputer = IterativeImputer(max_iter=100, + tol=0, + sample_posterior=False, + verbose=1, + random_state=rng) + imputer.fit(X_missing) + assert imputer.n_iter_ == imputer.max_iter + + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -650,7 +1062,7 @@ def test_missing_indicator_with_imputer(X, missing_values, 
X_trans_exp): @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer]) + [SimpleImputer, IterativeImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index af426f048afd4..6c4196d919aa0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -72,9 +72,11 @@ 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression', 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] + ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', - 'PowerTransformer', 'QuantileTransformer'] + 'PowerTransformer', 'QuantileTransformer', 'IterativeImputer'] + SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator']
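
A final note on the multiple-imputation usage described in
``doc/modules/impute.rst`` above: repeatedly applying ``IterativeImputer`` with
``sample_posterior=True`` and different random seeds yields multiple completed
datasets. A minimal sketch of that pattern follows; the aggregation at the end
is illustrative and not part of the estimator API::

    import numpy as np
    from sklearn.impute import IterativeImputer

    X = np.array([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]],
                 dtype=float)

    n_imputations = 5
    completed = []
    for seed in range(n_imputations):
        imp = IterativeImputer(sample_posterior=True, max_iter=10,
                               random_state=seed)
        completed.append(imp.fit_transform(X))

    # the spread across the imputed entries reflects the uncertainty
    # due to the missing values
    print(np.std([Xc[3, 0] for Xc in completed]))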