From dc67ec0a0b14d40f54abd386e3fc5a6aee092dee Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 3 Sep 2018 13:23:20 +1000 Subject: [PATCH 01/20] FEA Reinstate ChainedImputer This reverts commit f819704880b5d6affb49996f832f1aa5c8799571. --- doc/modules/classes.rst | 3 +- doc/modules/impute.rst | 71 +++- doc/whats_new/v0.20.rst | 1 - doc/whats_new/v0.21.rst | 8 + examples/plot_missing_values.py | 28 +- sklearn/impute.py | 558 +++++++++++++++++++++++++++++- sklearn/tests/test_impute.py | 233 ++++++++++++- sklearn/utils/estimator_checks.py | 2 +- 8 files changed, 887 insertions(+), 17 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 57ccfb5cff704..2d3174a9dcc05 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -656,8 +656,9 @@ Kernels: :template: class.rst impute.SimpleImputer + impute.ChainedImputer impute.MissingIndicator - + .. _kernel_approximation_ref: :mod:`sklearn.kernel_approximation` Kernel Approximation diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 0fd119857177b..268ce1c3ede19 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -16,6 +16,22 @@ values. However, this comes at the price of losing data which may be valuable i.e., to infer them from the known part of the data. See the :ref:`glossary` entry on imputation. + +Univariate vs. Multivariate Imputation +====================================== + +One type of imputation algorithm is univariate, which imputes values in the i-th +feature dimension using only non-missing values in that feature dimension +(e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation +algorithms use the entire set of available feature dimensions to estimate the +missing values (e.g. :class:`impute.ChainedImputer`). + + +.. _single_imputer: + +Univariate feature imputation +============================= + The :class:`SimpleImputer` class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median or most frequent) of each column in which the @@ -71,9 +87,60 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] +.. _chained_imputer: + + +Multivariate feature imputation +=============================== -:class:`SimpleImputer` can be used in a Pipeline as a way to build a composite -estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. +A more sophisticated approach is to use the :class:`ChainedImputer` class, which +implements the imputation technique from MICE (Multivariate Imputation by +Chained Equations). MICE models each feature with missing values as a function of +other features, and uses that estimate for imputation. It does so in a round-robin +fashion: at each step, a feature column is designated as output `y` and the other +feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`. +Then, the regressor is used to predict the unknown values of `y`. This is repeated +for each feature in a chained fashion, and then is done for a number of imputation +rounds. 
Here is an example snippet:: + + >>> import numpy as np + >>> from sklearn.impute import ChainedImputer + >>> imp = ChainedImputer(n_imputations=10, random_state=0) + >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) + ChainedImputer(imputation_order='ascending', initial_strategy='mean', + max_value=None, min_value=None, missing_values=nan, n_burn_in=10, + n_imputations=10, n_nearest_features=None, predictor=None, + random_state=0, verbose=False) + >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> print(np.round(imp.transform(X_test))) + [[ 1. 2.] + [ 6. 4.] + [13. 6.]] + +Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline +as a way to build a composite estimator that supports imputation. +See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. + +.. _multiple_imputation: + +Multiple vs. Single Imputation +============================== + +In the statistics community, it is common practice to perform multiple imputations, +generating, for example, 10 separate imputations for a single feature matrix. +Each of these 10 imputations is then put through the subsequent analysis pipeline +(e.g. feature engineering, clustering, regression, classification). The 10 final +analysis results (e.g. held-out validation error) allow the data scientist to +obtain understanding of the uncertainty inherent in the missing values. The above +practice is called multiple imputation. As implemented, the :class:`ChainedImputer` +class generates a single (averaged) imputation for each missing value because this +is the most common use case for machine learning applications. However, it can also be used +for multiple imputations by applying it repeatedly to the same dataset with different +random seeds with the ``n_imputations`` parameter set to 1. + +Note that a call to the ``transform`` method of :class:`ChainedImputer` is not +allowed to change the number of samples. Therefore multiple imputations cannot be +achieved by a single call to ``transform``. .. _missing_indicator: diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 2ed336b782174..402b7c178c8dd 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -155,7 +155,6 @@ Support for Python 3.3 has been officially dropped. :class:`cluster.AgglomerativeClustering`. :issue:`9875` by :user:`Kumar Ashutosh `. - :mod:`sklearn.compose` ...................... diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 202972f0575c0..2c010e5b1be59 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -40,6 +40,14 @@ Support for Python 3.4 and below has been officially dropped. - An entry goes here - An entry goes here +:mod:`sklearn.impute` +..................... + +- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for + imputing missing values by modeling each feature with missing values as a + function of other features in a round-robin fashion. :issue:`8478` by + :user:`Sergey Feldman `. + Multiple modules ................ diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 755943fb55bda..3ab1cfff95576 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -3,21 +3,22 @@ Imputing missing values before building an estimator ==================================================== -This example shows that imputing the missing values can give better -results than discarding the samples containing any missing value. 
-Imputing does not always improve the predictions, so please check via -cross-validation. Sometimes dropping rows or using marker values is -more effective. - Missing values can be replaced by the mean, the median or the most frequent value using the basic :func:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). +Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +round-robin linear regression, treating every variable as an output in +turn. The version implemented assumes Gaussian (output) variables. If your +features are obviously non-Normal, consider transforming them to look more +Normal so as to improve performance. + In addition of using an imputing method, we can also keep an indication of the missing information using :func:`sklearn.impute.MissingIndicator` which might carry some information. """ + import numpy as np import matplotlib.pyplot as plt @@ -25,7 +26,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, MissingIndicator +from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -72,10 +73,18 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) + # Estimate the score after chained imputation of the missing values + estimator = make_pipeline( + make_union(ChainedImputer(missing_values=0, random_state=0), + MissingIndicator(missing_values=0)), + RandomForestRegressor(random_state=0, n_estimators=100)) + chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std())) + (mean_impute_scores.mean(), mean_impute_scores.std()), + (chained_impute_scores.mean(), chained_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -91,7 +100,8 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', - 'Mean Imputation'] + 'Mean Imputation', + 'Chained Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index e98c425d1b34f..b9afbc76bd2d8 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -3,16 +3,22 @@ # Sergey Feldman # License: BSD 3 clause +from __future__ import division + import warnings import numbers +import time import numpy as np import numpy.ma as ma from scipy import sparse from scipy import stats +from collections import namedtuple from .base import BaseEstimator, TransformerMixin -from .utils import check_array +from .base import clone +from .preprocessing import normalize +from .utils import check_array, check_random_state, safe_indexing from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES @@ -24,9 +30,14 @@ zip = six.moves.zip map = six.moves.map +ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', + 'neighbor_feat_idx', + 'predictor']) + __all__ = [ 'MissingIndicator', 'SimpleImputer', + 'ChainedImputer', ] @@ -409,6 +420,551 @@ def transform(self, X): return X +class ChainedImputer(BaseEstimator, TransformerMixin): + """Chained imputer transformer to impute missing values. 
+ + Basic implementation of chained imputer from MICE (Multivariate + Imputations by Chained Equations) package from R. This version assumes all + of the features are Gaussian. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be imputed. + + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + + n_imputations : int, optional (default=100) + Number of chained imputation rounds to perform, the results of which + will be used in the final average. + + n_burn_in : int, optional (default=10) + Number of initial imputation rounds to perform the results of which + will not be returned. + + predictor : estimator object, default=BayesianRidge() + The predictor to use at each step of the round-robin imputation. + It must support ``return_std`` in its ``predict`` method. + + n_nearest_features : int, optional (default=None) + Number of other features to use to estimate the missing values of + the each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). Can provide significant speed-up when the number + of features is huge. If ``None``, all features will be used. + + initial_strategy : str, optional (default="mean") + Which strategy to use to initialize the missing values. Same as the + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. + + min_value : float, optional (default=None) + Minimum possible imputed value. Default of ``None`` will set minimum + to negative infinity. + + max_value : float, optional (default=None) + Maximum possible imputed value. Default of ``None`` will set maximum + to positive infinity. + + verbose : int, optional (default=0) + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by ``np.random``. + + Attributes + ---------- + initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' + The imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where + ``feat_idx`` is the current feature to be imputed, + ``neighbor_feat_idx`` is the array of other features used to impute the + current feature, and ``predictor`` is the trained predictor used for + the imputation. + + Notes + ----- + The R version of MICE does not have inductive functionality, i.e. first + fitting on ``X_train`` and then transforming any ``X_test`` without + additional fitting. 
We do this by storing each feature's predictor during + the round-robin ``fit`` phase, and predicting without refitting (in order) + during the ``transform`` phase. + + Features which contain all missing values at ``fit`` are discarded upon + ``transform``. + + Features with missing values in transform which did not have any missing + values in fit will be imputed with the initial imputation method only. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + """ + + def __init__(self, + missing_values=np.nan, + imputation_order='ascending', + n_imputations=100, + n_burn_in=10, + predictor=None, + n_nearest_features=None, + initial_strategy="mean", + min_value=None, + max_value=None, + verbose=False, + random_state=None): + + self.missing_values = missing_values + self.imputation_order = imputation_order + self.n_imputations = n_imputations + self.n_burn_in = n_burn_in + self.predictor = predictor + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + + def _impute_one_feature(self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + predictor=None, + fit_mode=True): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. The ``predictor`` must + support ``return_std=True`` in its ``predict`` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing ``feat_idx``. + + predictor : object + The predictor to use at this step of the round-robin imputation. + It must support ``return_std`` in its ``predict`` method. + If None, it will be cloned from self._predictor. + + fit_mode : boolean, default=True + Whether to fit and predict with the predictor or just predict. + + Returns + ------- + X_filled : ndarray + Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + + predictor : predictor with sklearn API + The fitted predictor used to impute + ``X_filled[missing_row_mask, feat_idx]``. 
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, predictor + + if predictor is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "predictor should be passed in.") + + if predictor is None: + predictor = clone(self._predictor) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + predictor.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + + # clip the values + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, predictor + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. + """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. 
+ """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + Returns + ------- + Xt : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. 
+ """ + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = check_array(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + + mask_missing_values = _get_mask(X, self.missing_values) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + valid_mask = np.flatnonzero(np.logical_not( + np.isnan(self.initial_imputer_.statistics_))) + Xt = X[:, valid_mask] + mask_missing_values = mask_missing_values[:, valid_mask] + + return Xt, X_filled, mask_missing_values + + def fit_transform(self, X, y=None): + """Fits the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + self.random_state_ = getattr(self, "random_state_", + check_random_state(self.random_state)) + + if self.predictor is None: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + self._predictor = clone(self.predictor) + + self._min_value = np.nan if self.min_value is None else self.min_value + self._max_value = np.nan if self.max_value is None else self.max_value + + self.initial_imputer_ = None + X, X_filled, mask_missing_values = self._initial_imputation(X) + + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled + + X_filled = np.clip(X_filled, self._min_value, self._max_value) + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. + # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + + abs_corr_mat = self._get_abs_corr_mat(X_filled) + + # impute data + n_rounds = self.n_burn_in + self.n_imputations + n_samples, n_features = X_filled.shape + Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + self.imputation_sequence_ = [] + if self.verbose > 0: + print("[ChainedImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for i_rnd in range(n_rounds): + if self.imputation_order == 'random': + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, + feat_idx, + abs_corr_mat) + X_filled, predictor = self._impute_one_feature( + X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + predictor=None, fit_mode=True) + predictor_triplet = ImputerTriplet(feat_idx, + neighbor_feat_idx, + predictor) + self.imputation_sequence_.append(predictor_triplet) + + if i_rnd >= self.n_burn_in: + Xt += X_filled + if self.verbose > 0: + print('[ChainedImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, n_rounds, time() - start_t)) + + Xt /= self.n_imputations + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def transform(self, X): + """Imputes all missing values in X. 
+ + Note that this is stochastic, and that if random_state is not fixed, + repeated calls, or permuted input, will yield different results. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self, 'initial_imputer_') + + X, X_filled, mask_missing_values = self._initial_imputation(X) + + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled + + X_filled = np.clip(X_filled, self._min_value, self._max_value) + + n_rounds = self.n_burn_in + self.n_imputations + n_imputations = len(self.imputation_sequence_) + imputations_per_round = n_imputations // n_rounds + i_rnd = 0 + Xt = np.zeros(X.shape, dtype=X.dtype) + if self.verbose > 0: + print("[ChainedImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for it, predictor_triplet in enumerate(self.imputation_sequence_): + X_filled, _ = self._impute_one_feature( + X_filled, + mask_missing_values, + predictor_triplet.feat_idx, + predictor_triplet.neighbor_feat_idx, + predictor=predictor_triplet.predictor, + fit_mode=False + ) + if not (it + 1) % imputations_per_round: + if i_rnd >= self.n_burn_in: + Xt += X_filled + if self.verbose > 1: + print('[ChainedImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, n_rounds, time() - start_t)) + i_rnd += 1 + + Xt /= self.n_imputations + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def fit(self, X, y=None): + """Fits the imputer on X and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored + + Returns + ------- + self : object + Returns self. + """ + self.fit_transform(X) + return self + + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f25f76e611d77..f9c3e4902f145 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1,3 +1,5 @@ +from __future__ import division + import pytest import numpy as np @@ -12,7 +14,9 @@ from sklearn.utils.testing import assert_false from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer +from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import BayesianRidge, ARDRegression from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -69,6 +73,10 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) + chained_imputer = ChainedImputer(initial_strategy=strategy) + X_imputed = chained_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + @pytest.mark.parametrize("strategy", ["const", 101, None]) def test_imputation_error_invalid_strategy(strategy): @@ -500,6 +508,227 @@ def test_imputation_copy(): # made, even if copy=False. 
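+# A minimal usage sketch (an illustrative addition, not from the original
+# patch): whatever values ChainedImputer fills in, it must preserve the
+# observed entries exactly and keep the shape of the input.
+def test_chained_imputer_preserves_observed_values():
+    rng = np.random.RandomState(0)
+    X = rng.rand(20, 5)
+    X[X < 0.2] = np.nan
+    imputer = ChainedImputer(n_imputations=2, n_burn_in=1, random_state=0)
+    Xt = imputer.fit_transform(X)
+    assert Xt.shape == X.shape
+    assert_allclose(Xt[~np.isnan(X)], X[~np.isnan(X)])
+
+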
+def test_chained_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = ChainedImputer(n_imputations=5, + n_burn_in=5, + verbose=True, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.001) + + +@pytest.mark.parametrize( + "imputation_order", + ['random', 'roman', 'ascending', 'descending', 'arabic'] +) +def test_chained_imputer_imputation_order(imputation_order): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 # this column should not be discarded by ChainedImputer + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + n_nearest_features=5, + min_value=0, + max_value=1, + verbose=False, + imputation_order=imputation_order, + random_state=rng) + imputer.fit_transform(X) + ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + if imputation_order == 'roman': + assert np.all(ordered_idx[:d-1] == np.arange(1, d)) + elif imputation_order == 'arabic': + assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1)) + elif imputation_order == 'random': + ordered_idx_round_1 = ordered_idx[:d-1] + ordered_idx_round_2 = ordered_idx[d-1:] + assert ordered_idx_round_1 != ordered_idx_round_2 + elif 'ending' in imputation_order: + assert len(ordered_idx) == 2 * (d - 1) + + +@pytest.mark.parametrize( + "predictor", + [DummyRegressor(), BayesianRidge(), ARDRegression()] +) +def test_chained_imputer_predictors(predictor): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + predictor=predictor, + random_state=rng) + imputer.fit_transform(X) + + # check that types are correct for predictors + hashes = [] + for triplet in imputer.imputation_sequence_: + assert triplet.predictor + hashes.append(id(triplet.predictor)) + + # check that each predictor is unique + assert len(set(hashes)) == len(hashes) + + +def test_chained_imputer_clip(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng).toarray() + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + min_value=0.1, + max_value=0.2, + random_state=rng) + + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +@pytest.mark.parametrize( + "strategy", + ["mean", "median", "most_frequent"] +) +def test_chained_imputer_missing_at_transform(strategy): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X_train = rng.randint(low=0, high=3, size=(n, d)) + X_test = rng.randint(low=0, high=3, size=(n, d)) + + X_train[:, 0] = 1 # definitely no missing values in 0th column + X_test[0, 0] = 0 # definitely missing value in 0th column + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) + initial_imputer = SimpleImputer(missing_values=0, + strategy=strategy).fit(X_train) + + # if there were no missing values at time of fit, then imputer will + # only use the initial imputer for that feature at transform + assert np.all(imputer.transform(X_test)[:, 0] == + 
initial_imputer.transform(X_test)[:, 0]) + + +def test_chained_imputer_transform_stochasticity(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng).toarray() + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + random_state=rng) + imputer.fit(X) + + X_fitted_1 = imputer.transform(X) + X_fitted_2 = imputer.transform(X) + + # sufficient to assert that the means are not the same + assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) + + +def test_chained_imputer_no_missing(): + rng = np.random.RandomState(0) + X = rng.rand(100, 100) + X[:, 0] = np.nan + m1 = ChainedImputer(n_imputations=10, random_state=rng) + m2 = ChainedImputer(n_imputations=10, random_state=rng) + pred1 = m1.fit(X).transform(X) + pred2 = m2.fit_transform(X) + # should exclude the first column entirely + assert_allclose(X[:, 1:], pred1) + # fit and fit_transform should both be identical + assert_allclose(pred1, pred2) + + +@pytest.mark.parametrize( + "rank", + [3, 5] +) +def test_chained_imputer_transform_recovery(rank): + rng = np.random.RandomState(0) + n = 100 + d = 100 + A = rng.rand(n, rank) + B = rng.rand(rank, d) + X_filled = np.dot(A, B) + # half is randomly missing + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data in half + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = ChainedImputer(n_imputations=10, + n_burn_in=10, + verbose=True, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1) + + +def test_chained_imputer_additive_matrix(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + A = rng.randn(n, d) + B = rng.randn(n, d) + X_filled = np.zeros(A.shape) + for i in range(d): + for j in range(d): + X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2 + # a quarter is randomly missing + nan_mask = rng.rand(n, d) < 0.25 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = ChainedImputer(n_imputations=25, + n_burn_in=10, + verbose=True, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, atol=0.01) + + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -616,7 +845,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer]) + [SimpleImputer, ChainedImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 54369033a75d3..931e50d920402 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -77,7 +77,7 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer'] From cbf89ecfd74138510e158ee2dd0d5fb2ba40557a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 3 Sep 2018 14:33:56 +1000 
Subject: [PATCH 02/20] Fix import of time --- sklearn/impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index b9afbc76bd2d8..89fb33a4f9034 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -7,7 +7,7 @@ import warnings import numbers -import time +from time import time import numpy as np import numpy.ma as ma From a4f2a89546a455a914a08f7e651e5e85823c0c16 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Mon, 17 Sep 2018 02:15:28 +0300 Subject: [PATCH 03/20] [MRG] ChainedImputer -> IterativeImputer, and documentation update (#11350) Towards making this more generic than MICE --- doc/modules/classes.rst | 2 +- doc/modules/impute.rst | 83 ++++++++----- doc/whats_new/v0.21.rst | 2 +- examples/plot_missing_values.py | 18 +-- sklearn/impute.py | 197 +++++++++++++++++------------- sklearn/tests/test_impute.py | 188 ++++++++++++++++------------ sklearn/utils/estimator_checks.py | 3 +- 7 files changed, 283 insertions(+), 210 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2d3174a9dcc05..ab0f473be4083 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -656,7 +656,7 @@ Kernels: :template: class.rst impute.SimpleImputer - impute.ChainedImputer + impute.IterativeImputer impute.MissingIndicator .. _kernel_approximation_ref: diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 268ce1c3ede19..8bb3ad8bf940b 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -24,7 +24,7 @@ One type of imputation algorithm is univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the -missing values (e.g. :class:`impute.ChainedImputer`). +missing values (e.g. :class:`impute.IterativeImputer`). .. _single_imputer: @@ -87,37 +87,37 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] -.. _chained_imputer: +.. _iterative_imputer: Multivariate feature imputation =============================== -A more sophisticated approach is to use the :class:`ChainedImputer` class, which -implements the imputation technique from MICE (Multivariate Imputation by -Chained Equations). MICE models each feature with missing values as a function of -other features, and uses that estimate for imputation. It does so in a round-robin -fashion: at each step, a feature column is designated as output `y` and the other -feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`. -Then, the regressor is used to predict the unknown values of `y`. This is repeated -for each feature in a chained fashion, and then is done for a number of imputation -rounds. Here is an example snippet:: +A more sophisticated approach is to use the :class:`IterativeImputer` class, +which models each feature with missing values as a function of other features, +and uses that estimate for imputation. It does so in an iterated round-robin +fashion: at each step, a feature column is designated as output ``y`` and the +other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, +y)`` for known ``y``. Then, the regressor is used to predict the missing values +of ``y``. This is done for each feature in an iterative fashion, and then is +repeated for ``n_iter`` imputation rounds. 
The results of the final imputation +round are returned. >>> import numpy as np - >>> from sklearn.impute import ChainedImputer - >>> imp = ChainedImputer(n_imputations=10, random_state=0) + >>> from sklearn.impute import IterativeImputer + >>> imp = IterativeImputer(n_iter=10, random_state=0) >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) - ChainedImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_burn_in=10, - n_imputations=10, n_nearest_features=None, predictor=None, - random_state=0, verbose=False) + IterativeImputer(imputation_order='ascending', initial_strategy='mean', + max_value=None, min_value=None, missing_values=nan, n_iter=10, + n_nearest_features=None, predictor=None, random_state=0, + sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 4.] - [13. 6.]] + [ 6. 3.] + [24. 6.]] -Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. @@ -127,21 +127,40 @@ Multiple vs. Single Imputation ============================== In the statistics community, it is common practice to perform multiple imputations, -generating, for example, 10 separate imputations for a single feature matrix. -Each of these 10 imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The 10 final -analysis results (e.g. held-out validation error) allow the data scientist to -obtain understanding of the uncertainty inherent in the missing values. The above -practice is called multiple imputation. As implemented, the :class:`ChainedImputer` -class generates a single (averaged) imputation for each missing value because this -is the most common use case for machine learning applications. However, it can also be used -for multiple imputations by applying it repeatedly to the same dataset with different -random seeds with the ``n_imputations`` parameter set to 1. - -Note that a call to the ``transform`` method of :class:`ChainedImputer` is not +generating, for example, ``m`` separate imputations for a single feature matrix. +Each of these ``m`` imputations is then put through the subsequent analysis pipeline +(e.g. feature engineering, clustering, regression, classification). The ``m`` final +analysis results (e.g. held-out validation errors) allow the data scientist +to obtain understanding of how analytic results may differ as a consequence +of the inherent uncertainty caused by the missing values. The above practice +is called multiple imputation. + +Our implementation of :class:`IterativeImputer` was inspired by the R MICE +package (Multivariate Imputation by Chained Equations) [1]_, but differs from +it by returning a single imputation instead of multiple imputations. However, +:class:`IterativeImputer` can also be used for multiple imputations by applying +it repeatedly to the same dataset with different random seeds when +``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple +vs. single imputations. + +It is still an open problem as to how useful single vs. multiple imputation is in +the context of prediction and classification when the user is not interested in +measuring uncertainty due to missing values. 
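+
+As a minimal sketch (the loop and the choice of five seeds below are
+illustrative, not part of the scikit-learn API), multiple imputations can be
+drawn along these lines::
+
+    >>> import numpy as np
+    >>> from sklearn.impute import IterativeImputer
+    >>> X = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
+    >>> imputations = [
+    ...     IterativeImputer(sample_posterior=True, random_state=seed)
+    ...     .fit_transform(X) for seed in range(5)]
+
+Each element of ``imputations`` is one complete imputed dataset; the spread
+across elements reflects the uncertainty caused by the missing entries.
+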
+ +Note that a call to the ``transform`` method of :class:`IterativeImputer` is not allowed to change the number of samples. Therefore multiple imputations cannot be achieved by a single call to ``transform``. +References +========== + +.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate + Imputation by Chained Equations in R". Journal of Statistical Software 45: + 1-67. + +.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis + with Missing Data". John Wiley & Sons, Inc., New York, NY, USA. + .. _missing_indicator: Marking imputed values diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2c010e5b1be59..2159e39dc126d 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -43,7 +43,7 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.impute` ..................... -- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 3ab1cfff95576..43d7ddfc497f3 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -4,11 +4,11 @@ ==================================================== Missing values can be replaced by the mean, the median or the most frequent -value using the basic :func:`sklearn.impute.SimpleImputer`. +value using the basic :class:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). -Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +Another option is the :class:`sklearn.impute.IterativeImputer`. This uses round-robin linear regression, treating every variable as an output in turn. The version implemented assumes Gaussian (output) variables. 
If your features are obviously non-Normal, consider transforming them to look more @@ -26,7 +26,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator +from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -73,18 +73,18 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) - # Estimate the score after chained imputation of the missing values + # Estimate the score after iterative imputation of the missing values estimator = make_pipeline( - make_union(ChainedImputer(missing_values=0, random_state=0), + make_union(IterativeImputer(missing_values=0, random_state=0), MissingIndicator(missing_values=0)), RandomForestRegressor(random_state=0, n_estimators=100)) - chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') + iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), - (chained_impute_scores.mean(), chained_impute_scores.std())) + (iterative_impute_scores.mean(), iterative_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -101,7 +101,7 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', 'Mean Imputation', - 'Chained Imputation'] + 'Multivariate Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index 89fb33a4f9034..3035040c1179a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -37,7 +37,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'ChainedImputer', + 'IterativeImputer', ] @@ -149,6 +149,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + Examples -------- >>> import numpy as np @@ -420,14 +424,13 @@ def transform(self, X): return X -class ChainedImputer(BaseEstimator, TransformerMixin): - """Chained imputer transformer to impute missing values. +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. - Basic implementation of chained imputer from MICE (Multivariate - Imputations by Chained Equations) package from R. This version assumes all - of the features are Gaussian. + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -449,24 +452,34 @@ class ChainedImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_imputations : int, optional (default=100) - Number of chained imputation rounds to perform, the results of which - will be used in the final average. + n_iter : int, optional (default=10) + Number of imputation rounds to perform before returning the imputations + computed during the final round. A round is a single imputation of each + feature with missing values. 
- n_burn_in : int, optional (default=10) - Number of initial imputation rounds to perform the results of which - will not be returned. - - predictor : estimator object, default=BayesianRidge() + predictor : estimator object, default=RidgeCV() or BayesianRidge() The predictor to use at each step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. Also, if + ``sample_posterior=True`` the default predictor will be + :class:`sklearn.linear_model.BayesianRidge` and + :class:`sklearn.linear_model.RidgeCV` otherwise. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted predictor for each imputation. Predictor must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - the each feature column. Nearness between features is measured using + each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). Can provide significant speed-up when the number - of features is huge. If ``None``, all features will be used. + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -487,37 +500,43 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by ``np.random``. + The seed of the pseudo random number generator to use. Randomizes + selection of predictor features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. Attributes ---------- - initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' - The imputer used to initialize the missing values. + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. + the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. 
Notes ----- - The R version of MICE does not have inductive functionality, i.e. first - fitting on ``X_train`` and then transforming any ``X_test`` without - additional fitting. We do this by storing each feature's predictor during - the round-robin ``fit`` phase, and predicting without refitting (in order) - during the ``transform`` phase. + To support imputation in inductive mode we store each feature's predictor + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values in transform which did not have any missing - values in fit will be imputed with the initial imputation method only. + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. References ---------- @@ -525,14 +544,19 @@ class ChainedImputer(BaseEstimator, TransformerMixin): Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_imputations=100, - n_burn_in=10, + n_iter=10, predictor=None, + sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -542,9 +566,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_imputations = n_imputations - self.n_burn_in = n_burn_in + self.n_iter = n_iter self.predictor = predictor + self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -582,7 +606,8 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. fit_mode : boolean, default=True @@ -621,12 +646,15 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - imputed_values[good_sigmas] = self.random_state_.normal( - loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + if self.sample_posterior: + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + else: + imputed_values = predictor.predict(X_test) # clip the values imputed_values = np.clip(imputed_values, @@ -822,44 +850,51 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) + if self.n_iter < 0: + raise ValueError( + "'n_iter' should be a positive integer. Got {} instead." 
+ .format(self.n_iter)) + if self.predictor is None: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + if self.sample_posterior: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + from .linear_model import RidgeCV + # including a very small alpha to approximate OLS + self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) else: self._predictor = clone(self.predictor) + if hasattr(self._predictor, 'random_state'): + self._predictor.random_state = self.random_state_ + self._min_value = np.nan if self.min_value is None else self.min_value self._max_value = np.nan if self.max_value is None else self.max_value self.initial_imputer_ = None - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. # see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(X_filled) + abs_corr_mat = self._get_abs_corr_mat(Xt) # impute data - n_rounds = self.n_burn_in + self.n_imputations - n_samples, n_features = X_filled.shape - Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + n_samples, n_features = Xt.shape self.imputation_sequence_ = [] if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(n_rounds): + for i_rnd in range(self.n_iter): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -867,22 +902,19 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - X_filled, predictor = self._impute_one_feature( - X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + Xt, predictor = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 0: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -904,28 +936,20 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt - 
n_rounds = self.n_burn_in + self.n_imputations - n_imputations = len(self.imputation_sequence_) - imputations_per_round = n_imputations // n_rounds + imputations_per_round = len(self.imputation_sequence_) // self.n_iter i_rnd = 0 - Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - X_filled, _ = self._impute_one_feature( - X_filled, + Xt, _ = self._impute_one_feature( + Xt, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, @@ -933,15 +957,12 @@ def transform(self, X): fit_mode=False ) if not (it + 1) % imputations_per_round: - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 1: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) i_rnd += 1 - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f9c3e4902f145..dd246cc3e8c4d 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -14,9 +14,9 @@ from sklearn.utils.testing import assert_false from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor -from sklearn.linear_model import BayesianRidge, ARDRegression +from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -73,8 +73,8 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) - chained_imputer = ChainedImputer(initial_strategy=strategy) - X_imputed = chained_imputer.fit_transform(X) + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2) @@ -508,46 +508,31 @@ def test_imputation_copy(): # made, even if copy=False. 
-def test_chained_imputer_rank_one(): - rng = np.random.RandomState(0) - d = 100 - A = rng.rand(d, 1) - B = rng.rand(1, d) - X = np.dot(A, B) - nan_mask = rng.rand(d, d) < 0.5 - X_missing = X.copy() - X_missing[nan_mask] = np.nan - - imputer = ChainedImputer(n_imputations=5, - n_burn_in=5, - verbose=True, - random_state=rng) - X_filled = imputer.fit_transform(X_missing) - assert_allclose(X_filled, X, atol=0.001) - - @pytest.mark.parametrize( "imputation_order", ['random', 'roman', 'ascending', 'descending', 'arabic'] ) -def test_chained_imputer_imputation_order(imputation_order): +def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - X[:, 0] = 1 # this column should not be discarded by ChainedImputer - - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - n_nearest_features=5, - min_value=0, - max_value=1, - verbose=False, - imputation_order=imputation_order, - random_state=rng) + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + n_iter = 2 + imputer = IterativeImputer(missing_values=0, + n_iter=n_iter, + n_nearest_features=5, + min_value=0, + max_value=1, + verbose=False, + imputation_order=imputation_order, + random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_ + if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) elif imputation_order == 'arabic': @@ -557,25 +542,24 @@ def test_chained_imputer_imputation_order(imputation_order): ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: - assert len(ordered_idx) == 2 * (d - 1) + assert len(ordered_idx) == n_iter * (d - 1) @pytest.mark.parametrize( "predictor", - [DummyRegressor(), BayesianRidge(), ARDRegression()] + [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] ) -def test_chained_imputer_predictors(predictor): +def test_iterative_imputer_predictors(predictor): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - predictor=predictor, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + predictor=predictor, + random_state=rng) imputer.fit_transform(X) # check that types are correct for predictors @@ -588,19 +572,18 @@ def test_chained_imputer_predictors(predictor): assert len(set(hashes)) == len(hashes) -def test_chained_imputer_clip(): +def test_iterative_imputer_clip(): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - min_value=0.1, - max_value=0.2, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + min_value=0.1, + max_value=0.2, + random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) @@ -612,7 +595,7 @@ def test_chained_imputer_clip(): "strategy", ["mean", "median", "most_frequent"] ) -def test_chained_imputer_missing_at_transform(strategy): +def test_iterative_imputer_missing_at_transform(strategy): rng = np.random.RandomState(0) n = 100 d = 10 @@ -622,11 +605,10 @@ def test_chained_imputer_missing_at_transform(strategy): 
X_train[:, 0] = 1 # definitely no missing values in 0th column X_test[0, 0] = 0 # definitely missing value in 0th column - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - initial_strategy=strategy, - random_state=rng).fit(X_train) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) @@ -636,17 +618,19 @@ def test_chained_imputer_missing_at_transform(strategy): initial_imputer.transform(X_test)[:, 0]) -def test_chained_imputer_transform_stochasticity(): - rng = np.random.RandomState(0) +def test_iterative_imputer_transform_stochasticity(): + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, - random_state=rng).toarray() + random_state=rng1).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - random_state=rng) + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=True, + random_state=rng1) imputer.fit(X) X_fitted_1 = imputer.transform(X) @@ -655,13 +639,39 @@ def test_chained_imputer_transform_stochasticity(): # sufficient to assert that the means are not the same assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) - -def test_chained_imputer_no_missing(): + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng1) + + imputer2 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng2) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert np.all(X_fitted_1a == X_fitted_1b) + assert np.all(X_fitted_1a == X_fitted_2) + + +def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan - m1 = ChainedImputer(n_imputations=10, random_state=rng) - m2 = ChainedImputer(n_imputations=10, random_state=rng) + m1 = IterativeImputer(n_iter=10, random_state=rng) + m2 = IterativeImputer(n_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely @@ -670,11 +680,28 @@ def test_chained_imputer_no_missing(): assert_allclose(pred1, pred2) +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(n_iter=5, + verbose=1, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.01) + + @pytest.mark.parametrize( "rank", [3, 5] ) -def test_chained_imputer_transform_recovery(rank): +def test_iterative_imputer_transform_recovery(rank): rng = np.random.RandomState(0) n = 100 d = 100 @@ -692,15 +719,14 @@ def test_chained_imputer_transform_recovery(rank): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=10, - n_burn_in=10, - verbose=True, - 
random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=1, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) - assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1) + assert_allclose(X_test_filled, X_test_est, atol=0.1) -def test_chained_imputer_additive_matrix(): +def test_iterative_imputer_additive_matrix(): rng = np.random.RandomState(0) n = 100 d = 10 @@ -721,14 +747,20 @@ def test_chained_imputer_additive_matrix(): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=25, - n_burn_in=10, - verbose=True, - random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=2, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) assert_allclose(X_test_filled, X_test_est, atol=0.01) +def test_iterative_imputer_error_param(): + rng = np.random.RandomState(42) + X = rng.randn(100, 2) + imputer = IterativeImputer(n_iter=-1) + with pytest.raises(ValueError, match='should be a positive integer'): + imputer.fit_transform(X) + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -845,7 +877,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer, ChainedImputer]) + [SimpleImputer, IterativeImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 931e50d920402..a59c1b8cd6e6b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -77,7 +77,8 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer', 'MissingIndicator', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'IterativeImputer', + 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer'] From 09a9a21ba1be7b1ef0722213a475bbabad47bf9f Mon Sep 17 00:00:00 2001 From: Ben Lawson Date: Fri, 5 Oct 2018 16:01:34 -0400 Subject: [PATCH 04/20] [MRG] sample from a truncated normal instead of clipping samples from a normal (#12177) --- doc/whats_new/v0.21.rst | 4 ++++ sklearn/impute.py | 22 +++++++++++++--------- sklearn/tests/test_impute.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2159e39dc126d..0cc7be8e7a6aa 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -48,6 +48,10 @@ Support for Python 3.4 and below has been officially dropped. function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. +- |Enhancement| :class:`impute.IterativeImputer` now samples from a truncated normal + distribution instead of a clipped normal distribution when ``sample_posterior=True``. + :issue:`12177` by :user:`Ben Lawson `. + Multiple modules ................ 
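
Clipping draws from a normal distribution piles probability mass onto the
bounds, while sampling from a truncated normal keeps a smooth density inside
them. A minimal illustration of the difference (not part of the patch; it
assumes only ``scipy.stats.truncnorm``, the same API the diff below uses)::

    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(0)
    mu, sigma, lo, hi = 0.4, 0.3, 0.0, 0.5

    # clipping a normal sample creates atoms at the boundaries
    clipped = np.clip(rng.normal(loc=mu, scale=sigma, size=100000), lo, hi)

    # truncnorm takes bounds in units of standard deviations from loc
    a, b = (lo - mu) / sigma, (hi - mu) / sigma
    truncated = stats.truncnorm(a=a, b=b, loc=mu, scale=sigma).rvs(
        size=100000, random_state=rng)

    print((clipped == hi).mean())    # noticeable mass exactly at the bound
    print((truncated == hi).mean())  # essentially zero
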
diff --git a/sklearn/impute.py b/sklearn/impute.py
index 3035040c1179a..e6b0181251071 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -651,15 +651,19 @@ def _impute_one_feature(self,
             good_sigmas = sigmas > 0
             imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
             imputed_values[~good_sigmas] = mus[~good_sigmas]
-            imputed_values[good_sigmas] = self.random_state_.normal(
-                loc=mus[good_sigmas], scale=sigmas[good_sigmas])
+            a = (self._min_value - mus) / sigmas
+            b = (self._max_value - mus) / sigmas
+            truncated_normal = stats.truncnorm(a=a,
+                                               b=b,
+                                               loc=mus[good_sigmas],
+                                               scale=sigmas[good_sigmas])
+            imputed_values[good_sigmas] = truncated_normal.rvs(
+                random_state=self.random_state_)
         else:
             imputed_values = predictor.predict(X_test)
-
-        # clip the values
-        imputed_values = np.clip(imputed_values,
-                                 self._min_value,
-                                 self._max_value)
+            imputed_values = np.clip(imputed_values,
+                                     self._min_value,
+                                     self._max_value)

         # update the feature
         X_filled[missing_row_mask, feat_idx] = imputed_values
@@ -869,8 +873,8 @@ def fit_transform(self, X, y=None):
         if hasattr(self._predictor, 'random_state'):
             self._predictor.random_state = self.random_state_

-        self._min_value = np.nan if self.min_value is None else self.min_value
-        self._max_value = np.nan if self.max_value is None else self.max_value
+        self._min_value = -np.inf if self.min_value is None else self.min_value
+        self._max_value = np.inf if self.max_value is None else self.max_value

         self.initial_imputer_ = None
         X, Xt, mask_missing_values = self._initial_imputation(X)
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index dd246cc3e8c4d..029864164a69f 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -4,6 +4,7 @@

 import numpy as np
 from scipy import sparse
+from scipy.stats import kstest

 import io

@@ -591,6 +592,39 @@ def test_iterative_imputer_clip():
     assert_allclose(Xt[X != 0], X[X != 0])


+def test_iterative_imputer_truncated_normal_posterior():
+    # test that the values that are imputed using `sample_posterior=True`
+    # with boundaries (`min_value` and `max_value` are not None) are drawn
+    # from a distribution that looks gaussian via the Kolmogorov Smirnov test
+    pytest.importorskip("scipy", minversion="0.17.0")
+    rng = np.random.RandomState(0)
+
+    X = rng.normal(size=(5, 5))
+    X[0][0] = np.nan
+
+    imputer = IterativeImputer(min_value=0,
+                               max_value=0.5,
+                               sample_posterior=True,
+                               random_state=rng)
+
+    imputer.fit_transform(X)
+    # generate multiple imputations for the single missing value
+    imputations = np.array([imputer.transform(X)[0][0] for _ in range(1000)])
+
+    assert all(imputations >= 0)
+    assert all(imputations <= 0.5)
+
+    mu, sigma = imputations.mean(), imputations.std()
+    # guard against a zero standard deviation before normalizing
+    if sigma == 0:
+        sigma += 1e-12
+    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
+    # we want to fail to reject null hypothesis
+    # null hypothesis: distributions are the same
+    assert ks_statistic < 0.2 or p_value > 0.1, \
+        "The posterior does not appear to be normal"
+
+
 @pytest.mark.parametrize(
     "strategy",
     ["mean", "median", "most_frequent"]
@@ -619,6 +653,7 @@ def test_iterative_imputer_missing_at_transform(strategy):


 def test_iterative_imputer_transform_stochasticity():
+    pytest.importorskip("scipy", minversion="0.17.0")
     rng1 = np.random.RandomState(0)
     rng2 = np.random.RandomState(1)
     n = 100
@@ -761,6 +796,7 @@ def test_iterative_imputer_error_param():
     with pytest.raises(ValueError, match='should be a positive
integer'): imputer.fit_transform(X) + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), From caa089e79451a14a332f997a03ed7fec9602eaa1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 8 Oct 2018 09:01:06 +1100 Subject: [PATCH 05/20] DOC Merge IterativeImputer what's news --- doc/whats_new/v0.21.rst | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 5344dccdbce28..cb03c7eba027b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -43,14 +43,11 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.impute` ..................... -- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for - imputing missing values by modeling each feature with missing values as a - function of other features in a round-robin fashion. :issue:`8478` by - :user:`Sergey Feldman `. - -- |Enhancement| :class:`impute.IterativeImputer` now samples from a truncated normal - distribution instead of a clipped normal distribution when ``sample_posterior=True``. - :issue:`12177` by :user:`Ben Lawson `. +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy + for imputing missing values by modeling each feature with missing values as a + function of other features in a round-robin fashion. :issue:`8478` and + :issue:`12177` by :user:`Sergey Feldman ` :user:`Ben Lawson + `. :mod:`sklearn.cluster` ...................... From f103c6be7a0848303a2a8cbb8bbc74d37ace3b15 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 09:04:09 +1100 Subject: [PATCH 06/20] Undo changes to v0.20.rst --- doc/whats_new/v0.20.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index e71e6843106a9..acd54575fd9bd 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -54,6 +54,7 @@ random sampling procedures. Changelog --------- + :mod:`sklearn.compose` ...................... From 9e106580a7147a0a65ce142f84700a278ebcad9a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 09:07:34 +1100 Subject: [PATCH 07/20] Revert changes to v0.20.rst --- doc/whats_new/v0.20.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index acd54575fd9bd..b4e2d65c0ad87 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -54,7 +54,6 @@ random sampling procedures. Changelog --------- - :mod:`sklearn.compose` ...................... @@ -495,6 +494,7 @@ Support for Python 3.3 has been officially dropped. :class:`cluster.AgglomerativeClustering`. :issue:`9875` by :user:`Kumar Ashutosh `. + :mod:`sklearn.compose` ...................... From 0aab6dc73f5284057233bfd91f1c39f80db1ddd9 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 10:56:38 +1100 Subject: [PATCH 08/20] DOC Normalize whitespace in doctest --- doc/modules/impute.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index a8164f27efbea..62cbe74c4ea97 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,11 +106,11 @@ round are returned. 
>>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) + >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_iter=10, - n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=False) + max_value=None, min_value=None, missing_values=nan, n_iter=10, + n_nearest_features=None, predictor=None, random_state=0, + sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] From d34f2270b73b8c8238789e1084d73f84f3bcc7a6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 13:01:54 +1100 Subject: [PATCH 09/20] Fix for SciPy 0.17 --- sklearn/impute.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 411bc78acec8d..f888d7fe83d4f 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -8,9 +8,11 @@ import warnings import numbers from time import time +from distutils.version import LooseVersion import numpy as np import numpy.ma as ma +import scipy from scipy import sparse from scipy import stats from collections import namedtuple @@ -647,14 +649,24 @@ def _impute_one_feature(self, good_sigmas = sigmas > 0 imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) imputed_values[~good_sigmas] = mus[~good_sigmas] + mus = mus[good_sigmas] + sigmas = sigmas[good_sigmas] a = (self._min_value - mus) / sigmas b = (self._max_value - mus) / sigmas - truncated_normal = stats.truncnorm(a=a, - b=b, - loc=mus[good_sigmas], - scale=sigmas[good_sigmas]) - imputed_values[good_sigmas] = truncated_normal.rvs( - random_state=self.random_state_) + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[good_sigmas] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[good_sigmas] = truncated_normal.rvs( + random_state=self.random_state_) else: imputed_values = predictor.predict(X_test) imputed_values = np.clip(imputed_values, From b44dff8a086f37a960c8219ccd4694d1c87632eb Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 18:05:05 +1100 Subject: [PATCH 10/20] Fix doctest --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 62cbe74c4ea97..3b029c4d15751 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -115,7 +115,7 @@ round are returned. >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] [ 6. 3.] - [24. 6.]] + [26. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. 
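
As the documentation above notes, either imputer can sit at the head of a
Pipeline. A minimal sketch of that composition (illustrative only; it assumes
the ``IterativeImputer`` added by this series is importable from
``sklearn.impute``)::

    import numpy as np
    from sklearn.impute import IterativeImputer
    from sklearn.linear_model import BayesianRidge
    from sklearn.pipeline import make_pipeline

    rng = np.random.RandomState(0)
    X = rng.rand(50, 3)
    y = X @ np.array([1., 2., 3.])   # target defined before punching holes
    X[rng.rand(50, 3) < 0.2] = np.nan

    # the imputer fills the NaNs before the downstream regressor sees them
    model = make_pipeline(IterativeImputer(random_state=0), BayesianRidge())
    model.fit(X, y)
    print(model.predict(X[:3]))
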
From 0453c19ede55a69f8c706c6e251cc2cd7fdb75a2 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 13:46:01 +1100 Subject: [PATCH 11/20] Create examples/impute gallery --- build_tools/circle/build_doc.sh | 2 +- doc/modules/impute.rst | 2 +- examples/{ => impute}/plot_missing_values.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename examples/{ => impute}/plot_missing_values.py (100%) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index d32f7b9000b95..363148817c61c 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -143,7 +143,7 @@ cd - set +o pipefail affected_doc_paths() { - files=$(git diff --name-only origin/master...$CIRCLE_SHA1) + files=$(git diff --name-only origin/"$CIRCLE_BRANCH"...$CIRCLE_SHA1) echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' sklearn_files=$(echo "$files" | grep '^sklearn/') diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 3b029c4d15751..1d1f6e926e8f8 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -119,7 +119,7 @@ round are returned. Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. -See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. +See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. .. _multiple_imputation: diff --git a/examples/plot_missing_values.py b/examples/impute/plot_missing_values.py similarity index 100% rename from examples/plot_missing_values.py rename to examples/impute/plot_missing_values.py From 87585614aac4c7ab1a1e45fefc093eb805a2653e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 15:58:13 +1100 Subject: [PATCH 12/20] Add missing readme file --- examples/impute/README.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 examples/impute/README.txt diff --git a/examples/impute/README.txt b/examples/impute/README.txt new file mode 100644 index 0000000000000..e42264caf9087 --- /dev/null +++ b/examples/impute/README.txt @@ -0,0 +1,6 @@ +.. _impute_examples: + +Missing Value Imputation +------------------------ + +Examples concerning the :mod:`sklearn.impute` module. 
From f4d970ec9ef287d945e2cf16317b08b897738172 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 15:58:46 +1100 Subject: [PATCH 13/20] Undo change to circle build --- build_tools/circle/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 363148817c61c..d32f7b9000b95 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -143,7 +143,7 @@ cd - set +o pipefail affected_doc_paths() { - files=$(git diff --name-only origin/"$CIRCLE_BRANCH"...$CIRCLE_SHA1) + files=$(git diff --name-only origin/master...$CIRCLE_SHA1) echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' sklearn_files=$(echo "$files" | grep '^sklearn/') From 34b7a4648358c2d7a10095c439d6bde7983d6d83 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 24 Jan 2019 09:56:45 +1100 Subject: [PATCH 14/20] DOC Make IterativeImputer doctest more stable (#13026) --- doc/modules/impute.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1d1f6e926e8f8..45523d74fe9b8 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,16 +106,17 @@ round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 3.] - [26. 6.]] + [ 6. 12.] + [ 3. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. 
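
The round-robin state that the next patch's test inspects is stored on the
fitted imputer as ``imputation_sequence_``. A hypothetical sketch of poking at
it (``predictor`` is the parameter name at this point in the series; it is
renamed later)::

    import numpy as np
    from sklearn.impute import IterativeImputer
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(20, 4)
    X[rng.rand(20, 4) < 0.25] = np.nan

    imp = IterativeImputer(predictor=DecisionTreeRegressor(random_state=0),
                           random_state=0)
    imp.fit(X)
    # one (feat_idx, neighbor_feat_idx, predictor) triplet is stored per
    # imputed feature and per imputation round
    print({type(t.predictor).__name__ for t in imp.imputation_sequence_})
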
From b58bd0b1ef7c6fd4bd5b4947bdbf2a91b75bb9d1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 24 Jan 2019 12:08:10 +1100 Subject: [PATCH 15/20] TST IterativeImputer: Check predictor type (#13039) --- sklearn/tests/test_impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index fd2bbd4ec5ad0..3f347edd00e3e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -565,7 +565,7 @@ def test_iterative_imputer_predictors(predictor): # check that types are correct for predictors hashes = [] for triplet in imputer.imputation_sequence_: - assert triplet.predictor + assert isinstance(triplet.predictor, type(predictor)) hashes.append(id(triplet.predictor)) # check that each predictor is unique From cf4670c23ae00725e6efa8c0283311ccb631e28e Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Thu, 24 Jan 2019 08:59:07 -0800 Subject: [PATCH 16/20] EHN: Changing default model for IterativeImputer to BayesianRidge (#13038) --- sklearn/impute.py | 16 ++++------------ sklearn/tests/test_impute.py | 6 ++++-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index f888d7fe83d4f..6dfce49f7b1f2 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -455,13 +455,10 @@ class IterativeImputer(BaseEstimator, TransformerMixin): computed during the final round. A round is a single imputation of each feature with missing values. - predictor : estimator object, default=RidgeCV() or BayesianRidge() + predictor : estimator object, default=BayesianRidge() The predictor to use at each step of the round-robin imputation. If ``sample_posterior`` is True, the predictor must support - ``return_std`` in its ``predict`` method. Also, if - ``sample_posterior=True`` the default predictor will be - :class:`sklearn.linear_model.BayesianRidge` and - :class:`sklearn.linear_model.RidgeCV` otherwise. + ``return_std`` in its ``predict`` method. 
sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the @@ -868,13 +865,8 @@ def fit_transform(self, X, y=None): .format(self.n_iter)) if self.predictor is None: - if self.sample_posterior: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() - else: - from .linear_model import RidgeCV - # including a very small alpha to approximate OLS - self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() else: self._predictor = clone(self.predictor) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 3f347edd00e3e..a2bf8d75ef9e5 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -547,7 +547,7 @@ def test_iterative_imputer_imputation_order(imputation_order): @pytest.mark.parametrize( "predictor", - [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] + [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] ) def test_iterative_imputer_predictors(predictor): rng = np.random.RandomState(0) @@ -565,7 +565,9 @@ def test_iterative_imputer_predictors(predictor): # check that types are correct for predictors hashes = [] for triplet in imputer.imputation_sequence_: - assert isinstance(triplet.predictor, type(predictor)) + expected_type = (type(predictor) if predictor is not None + else type(BayesianRidge())) + assert isinstance(triplet.predictor, expected_type) hashes.append(id(triplet.predictor)) # check that each predictor is unique From dc304a4e16ff782eec52ee036c88d91437c740a2 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Thu, 24 Jan 2019 18:50:52 -0800 Subject: [PATCH 17/20] EXA Add IterativeImputer extended example (#12100) --- doc/modules/impute.rst | 70 ++++++---- ...t_iterative_imputer_variants_comparison.py | 126 ++++++++++++++++++ examples/impute/plot_missing_values.py | 58 ++++---- sklearn/impute.py | 2 +- 4 files changed, 200 insertions(+), 56 deletions(-) create mode 100644 examples/impute/plot_iterative_imputer_variants_comparison.py diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 45523d74fe9b8..1db20e9c6dcdb 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -9,19 +9,19 @@ Imputation of missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an -array are numerical, and that all have and hold meaning. A basic strategy to use -incomplete datasets is to discard entire rows and/or columns containing missing -values. However, this comes at the price of losing data which may be valuable -(even though incomplete). A better strategy is to impute the missing values, -i.e., to infer them from the known part of the data. See the :ref:`glossary` -entry on imputation. +array are numerical, and that all have and hold meaning. A basic strategy to +use incomplete datasets is to discard entire rows and/or columns containing +missing values. However, this comes at the price of losing data which may be +valuable (even though incomplete). A better strategy is to impute the missing +values, i.e., to infer them from the known part of the data. See the +:ref:`glossary` entry on imputation. Univariate vs. 
Multivariate Imputation ====================================== -One type of imputation algorithm is univariate, which imputes values in the i-th -feature dimension using only non-missing values in that feature dimension +One type of imputation algorithm is univariate, which imputes values in the +i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the missing values (e.g. :class:`impute.IterativeImputer`). @@ -66,9 +66,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.] [7. 6.]] -Note that this format is not meant to be used to implicitly store missing values -in the matrix because it would densify it at transform time. Missing values encoded -by 0 must be used with dense input. +Note that this format is not meant to be used to implicitly store missing +values in the matrix because it would densify it at transform time. Missing +values encoded by 0 must be used with dense input. The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``'most_frequent'`` or @@ -110,7 +110,7 @@ round are returned. IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=False) + sample_posterior=False, verbose=0) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) @@ -118,23 +118,35 @@ round are returned. [ 6. 12.] [ 3. 6.]] -Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline -as a way to build a composite estimator that supports imputation. +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a +Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. +Flexibility of IterativeImputer +------------------------------- + +There are many well-established imputation packages in the R data science +ecosystem: Amelia, mi, mice, missForest, etc. missForest is popular, and turns +out to be a particular instance of different sequential imputation algorithms +that can all be implemented with :class:`IterativeImputer` by passing in +different regressors to be used for predicting missing feature values. In the +case of missForest, this regressor is a Random Forest. +See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`. + + .. _multiple_imputation: Multiple vs. Single Imputation -============================== +------------------------------ -In the statistics community, it is common practice to perform multiple imputations, -generating, for example, ``m`` separate imputations for a single feature matrix. -Each of these ``m`` imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The ``m`` final -analysis results (e.g. held-out validation errors) allow the data scientist -to obtain understanding of how analytic results may differ as a consequence -of the inherent uncertainty caused by the missing values. The above practice -is called multiple imputation. 
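
An illustrative sketch of that practice with the API from this series
(``m = 5`` completed datasets via repeated fits; it assumes
``sample_posterior=True`` draws imputations from the predictive posterior as
described)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 4))
    X[rng.rand(100, 4) < 0.2] = np.nan

    # five stochastic completions of the same feature matrix
    imputations = [
        IterativeImputer(sample_posterior=True,
                         random_state=seed).fit_transform(X)
        for seed in range(5)
    ]
    # the spread across completions reflects uncertainty about the
    # missing entries
    print(np.std([Xc.mean() for Xc in imputations]))
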
+In the statistics community, it is common practice to perform multiple +imputations, generating, for example, ``m`` separate imputations for a single +feature matrix. Each of these ``m`` imputations is then put through the +subsequent analysis pipeline (e.g. feature engineering, clustering, regression, +classification). The ``m`` final analysis results (e.g. held-out validation +errors) allow the data scientist to obtain understanding of how analytic +results may differ as a consequence of the inherent uncertainty caused by the +missing values. The above practice is called multiple imputation. Our implementation of :class:`IterativeImputer` was inspired by the R MICE package (Multivariate Imputation by Chained Equations) [1]_, but differs from @@ -144,13 +156,13 @@ it repeatedly to the same dataset with different random seeds when ``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple vs. single imputations. -It is still an open problem as to how useful single vs. multiple imputation is in -the context of prediction and classification when the user is not interested in -measuring uncertainty due to missing values. +It is still an open problem as to how useful single vs. multiple imputation is +in the context of prediction and classification when the user is not +interested in measuring uncertainty due to missing values. -Note that a call to the ``transform`` method of :class:`IterativeImputer` is not -allowed to change the number of samples. Therefore multiple imputations cannot be -achieved by a single call to ``transform``. +Note that a call to the ``transform`` method of :class:`IterativeImputer` is +not allowed to change the number of samples. Therefore multiple imputations +cannot be achieved by a single call to ``transform``. References ========== diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py new file mode 100644 index 0000000000000..a850deb273f24 --- /dev/null +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -0,0 +1,126 @@ +""" +========================================================= +Imputing missing values with variants of IterativeImputer +========================================================= + +The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be +used with a variety of predictors to do round-robin regression, treating every +variable as an output in turn. + +In this example we compare some predictors for the purpose of missing feature +imputation with :class:`sklearn.imputeIterativeImputer`:: + + :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression + :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression + :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R + :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN + imputation approaches + +Of particular interest is the ability of +:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a +popular imputation package for R. In this example, we have chosen to use +:class:`sklearn.ensemble.ExtraTreesRegressor` instead of +:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its +increased speed. + +Note that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN +imputation, which learns from samples with missing values by using a distance +metric that accounts for missing values, rather than imputing them. 
+ +The goal is to compare different predictors to see which one is best for the +:class:`sklearn.impute.IterativeImputer` when using a +:class:`sklearn.linear_model.BayesianRidge` estimator on the California housing +dataset with a single value randomly removed from each row. + +For this particular pattern of missing values we see that +:class:`sklearn.ensemble.ExtraTreesRegressor` and +:class:`sklearn.linear_model.BayesianRidge` give the best results. +""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.datasets import fetch_california_housing +from sklearn.impute import SimpleImputer +from sklearn.impute import IterativeImputer +from sklearn.linear_model import BayesianRidge +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.neighbors import KNeighborsRegressor +from sklearn.pipeline import make_pipeline +from sklearn.model_selection import cross_val_score + +N_SPLITS = 5 + +rng = np.random.RandomState(0) + +X_full, y_full = fetch_california_housing(return_X_y=True) +n_samples, n_features = X_full.shape + +# Estimate the score on the entire dataset, with no missing values +br_estimator = BayesianRidge() +score_full_data = pd.DataFrame( + cross_val_score( + br_estimator, X_full, y_full, scoring='neg_mean_squared_error', + cv=N_SPLITS + ), + columns=['Full Data'] +) + +# Add a single missing value to each row +X_missing = X_full.copy() +y_missing = y_full +missing_samples = np.arange(n_samples) +missing_features = rng.choice(n_features, n_samples, replace=True) +X_missing[missing_samples, missing_features] = np.nan + +# Estimate the score after imputation (mean and median strategies) +score_simple_imputer = pd.DataFrame() +for strategy in ('mean', 'median'): + estimator = make_pipeline( + SimpleImputer(missing_values=np.nan, strategy=strategy), + br_estimator + ) + score_simple_imputer[strategy] = cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +# Estimate the score after iterative imputation of the missing values +# with different predictors +predictors = [ + BayesianRidge(), + DecisionTreeRegressor(max_features='sqrt', random_state=0), + ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), + KNeighborsRegressor(n_neighbors=15) +] +score_iterative_imputer = pd.DataFrame() +for predictor in predictors: + estimator = make_pipeline( + IterativeImputer(random_state=0, predictor=predictor), + br_estimator + ) + score_iterative_imputer[predictor.__class__.__name__] = \ + cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +scores = pd.concat( + [score_full_data, score_simple_imputer, score_iterative_imputer], + keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 +) + +# plot boston results +fig, ax = plt.subplots(figsize=(13, 6)) +means = -scores.mean() +errors = scores.std() +means.plot.barh(xerr=errors, ax=ax) +ax.set_title('California Housing Regression with Different Imputation Methods') +ax.set_xlabel('MSE (smaller is better)') +ax.set_yticks(np.arange(means.shape[0])) +ax.set_yticklabels([" w/ ".join(label) for label in means.index.get_values()]) +plt.tight_layout(pad=1) +plt.show() diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 43d7ddfc497f3..897b66aad246c 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -12,12 +12,13 @@ round-robin 
linear regression, treating every variable as an output in turn. The version implemented assumes Gaussian (output) variables. If your features are obviously non-Normal, consider transforming them to look more -Normal so as to improve performance. +Normal so as to potentially improve performance. In addition of using an imputing method, we can also keep an indication of the missing information using :func:`sklearn.impute.MissingIndicator` which might carry some information. """ +print(__doc__) import numpy as np import matplotlib.pyplot as plt @@ -31,6 +32,19 @@ rng = np.random.RandomState(0) +N_SPLITS = 5 +REGRESSOR = RandomForestRegressor(random_state=0, n_estimators=100) + + +def get_scores_for_imputer(imputer, X_missing, y_missing): + estimator = make_pipeline( + make_union(imputer, MissingIndicator(missing_values=0)), + REGRESSOR) + impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error', + cv=N_SPLITS) + return impute_scores + def get_results(dataset): X_full, y_full = dataset.data, dataset.target @@ -38,9 +52,9 @@ def get_results(dataset): n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - full_scores = cross_val_score(estimator, X_full, y_full, - scoring='neg_mean_squared_error', cv=5) + full_scores = cross_val_score(REGRESSOR, X_full, y_full, + scoring='neg_mean_squared_error', + cv=N_SPLITS) # Add missing values in 75% of the lines missing_rate = 0.75 @@ -51,35 +65,27 @@ def get_results(dataset): dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) - - # Estimate the score after replacing missing values by 0 X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - zero_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + + # Estimate the score after replacing missing values by 0 + imputer = SimpleImputer(missing_values=0, + strategy='constant', + fill_value=0) + zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after imputation (mean strategy) of the missing values - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - estimator = make_pipeline( - make_union(SimpleImputer(missing_values=0, strategy="mean"), - MissingIndicator(missing_values=0)), - RandomForestRegressor(random_state=0, n_estimators=100)) - mean_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + imputer = SimpleImputer(missing_values=0, strategy="mean") + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after iterative imputation of the missing values - estimator = make_pipeline( - make_union(IterativeImputer(missing_values=0, random_state=0), - MissingIndicator(missing_values=0)), - RandomForestRegressor(random_state=0, n_estimators=100)) - iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') + imputer = IterativeImputer(missing_values=0, + random_state=0, + n_nearest_features=5) + iterative_impute_scores = get_scores_for_imputer(imputer, + X_missing, + y_missing) return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), 
zero_impute_scores.std()), diff --git a/sklearn/impute.py b/sklearn/impute.py index 6dfce49f7b1f2..ef4e552260e05 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -556,7 +556,7 @@ def __init__(self, initial_strategy="mean", min_value=None, max_value=None, - verbose=False, + verbose=0, random_state=None): self.missing_values = missing_values From 92e731606a43b5594f5c4a2a2f32fe9b8648cc38 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Tue, 12 Feb 2019 12:54:30 -0800 Subject: [PATCH 18/20] ENH IterativeImputer: n_iter->max_iter (#13061) --- doc/modules/impute.rst | 14 +- ...t_iterative_imputer_variants_comparison.py | 16 +- sklearn/impute.py | 217 +++++++++++------- sklearn/tests/test_impute.py | 183 ++++++++++++--- 4 files changed, 294 insertions(+), 136 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1db20e9c6dcdb..6de5df8b12729 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -100,17 +100,17 @@ fashion: at each step, a feature column is designated as output ``y`` and the other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, y)`` for known ``y``. Then, the regressor is used to predict the missing values of ``y``. This is done for each feature in an iterative fashion, and then is -repeated for ``n_iter`` imputation rounds. The results of the final imputation -round are returned. +repeated for ``max_iter`` imputation rounds. The results of the final +imputation round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer - >>> imp = IterativeImputer(n_iter=10, random_state=0) + >>> imp = IterativeImputer(max_iter=10, random_state=0) >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE - IterativeImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_iter=10, - n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=0) + IterativeImputer(estimator=None, imputation_order='ascending', + initial_strategy='mean', max_iter=10, max_value=None, + min_value=None, missing_values=nan, n_nearest_features=None, + random_state=0, sample_posterior=False, tol=0.001, verbose=0) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index a850deb273f24..77a12e87a1e8a 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -4,10 +4,10 @@ ========================================================= The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be -used with a variety of predictors to do round-robin regression, treating every +used with a variety of estimators to do round-robin regression, treating every variable as an output in turn. -In this example we compare some predictors for the purpose of missing feature +In this example we compare some estimators for the purpose of missing feature imputation with :class:`sklearn.imputeIterativeImputer`:: :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression @@ -27,7 +27,7 @@ imputation, which learns from samples with missing values by using a distance metric that accounts for missing values, rather than imputing them. 
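
The missForest-style variant this docstring mentions reduces to swapping a
tree ensemble into the round robin. A minimal sketch under the ``estimator``
parameter name that this patch introduces (illustrative, not taken from the
example itself)::

    import numpy as np
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    X[rng.rand(100, 5) < 0.3] = np.nan

    imp = IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
        max_iter=10, random_state=0)
    X_filled = imp.fit_transform(X)
    print(np.isnan(X_filled).any())  # False: every entry is now filled
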
-The goal is to compare different predictors to see which one is best for the +The goal is to compare different estimators to see which one is best for the :class:`sklearn.impute.IterativeImputer` when using a :class:`sklearn.linear_model.BayesianRidge` estimator on the California housing dataset with a single value randomly removed from each row. @@ -89,20 +89,20 @@ ) # Estimate the score after iterative imputation of the missing values -# with different predictors -predictors = [ +# with different estimators +estimators = [ BayesianRidge(), DecisionTreeRegressor(max_features='sqrt', random_state=0), ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), KNeighborsRegressor(n_neighbors=15) ] score_iterative_imputer = pd.DataFrame() -for predictor in predictors: +for estimator in estimators: estimator = make_pipeline( - IterativeImputer(random_state=0, predictor=predictor), + IterativeImputer(random_state=0, estimator=estimator), br_estimator ) - score_iterative_imputer[predictor.__class__.__name__] = \ + score_iterative_imputer[estimator.__class__.__name__] = \ cross_val_score( estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS diff --git a/sklearn/impute.py b/sklearn/impute.py index ef4e552260e05..c8ae55ffa318c 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -19,6 +19,7 @@ from .base import BaseEstimator, TransformerMixin from .base import clone +from .exceptions import ConvergenceWarning from .preprocessing import normalize from .utils import check_array, check_random_state, safe_indexing from .utils.sparsefuncs import _get_median @@ -30,7 +31,7 @@ ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', 'neighbor_feat_idx', - 'predictor']) + 'estimator']) __all__ = [ 'MissingIndicator', @@ -436,36 +437,28 @@ class IterativeImputer(BaseEstimator, TransformerMixin): The placeholder for the missing values. All occurrences of ``missing_values`` will be imputed. - imputation_order : str, optional (default="ascending") - The order in which the features will be imputed. Possible values: - - "ascending" - From features with fewest missing values to most. - "descending" - From features with most missing values to fewest. - "roman" - Left to right. - "arabic" - Right to left. - "random" - A random order for each round. - - n_iter : int, optional (default=10) - Number of imputation rounds to perform before returning the imputations - computed during the final round. A round is a single imputation of each - feature with missing values. - - predictor : estimator object, default=BayesianRidge() - The predictor to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the - fitted predictor for each imputation. Predictor must support + fitted estimator for each imputation. Estimator must support ``return_std`` in its ``predict`` method if set to ``True``. Set to ``True`` if using ``IterativeImputer`` for multiple imputations. + max_iter : int, optional (default=10) + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. 
The stopping criterion + is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, + where `X_t` is `X` at iteration `t. Note that early stopping is only + applied if ``sample_posterior=False``. + + tol : float, optional (default=1e-3) + Tolerance of the stopping condition. + n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of each feature column. Nearness between features is measured using @@ -481,6 +474,20 @@ class IterativeImputer(BaseEstimator, TransformerMixin): ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` Valid values: {"mean", "median", "most_frequent", or "constant"}. + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + min_value : float, optional (default=None) Minimum possible imputed value. Default of ``None`` will set minimum to negative infinity. @@ -496,7 +503,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin): random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use. Randomizes - selection of predictor features if n_nearest_features is not None, the + selection of estimator features if n_nearest_features is not None, the ``imputation_order`` if ``random``, and the sampling from posterior if ``sample_posterior`` is True. Use an integer for determinism. See :term:`the Glossary `. @@ -507,11 +514,16 @@ class IterativeImputer(BaseEstimator, TransformerMixin): Imputer used to initialize the missing values. imputation_sequence_ : list of tuples - Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where + Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the - current feature, and ``predictor`` is the trained predictor used for - the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + current feature, and ``estimator`` is the trained estimator used for + the imputation. Length is ``self.n_features_with_missing_ * + self.n_iter_``. + + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + ``self.max_iter`` if early stopping criterion was reached. n_features_with_missing_ : int Number of features with missing values. @@ -522,7 +534,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin): Notes ----- - To support imputation in inductive mode we store each feature's predictor + To support imputation in inductive mode we store each feature's estimator during the ``fit`` phase, and predict without refitting (in order) during the ``transform`` phase. 
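
A quick sketch of the stopping behaviour documented above; the fitted
``n_iter_`` attribute records the round at which the tolerance check fired
(it assumes this patch's ``max_iter``/``tol`` API)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.rand(200, 5)
    X[rng.rand(200, 5) < 0.2] = np.nan

    imp = IterativeImputer(max_iter=100, tol=1e-3, random_state=0)
    imp.fit(X)
    # with sample_posterior=False the loop exits once successive imputations
    # differ by less than the scaled tolerance, so n_iter_ << max_iter here
    print(imp.n_iter_)
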
@@ -548,24 +560,26 @@ class IterativeImputer(BaseEstimator, TransformerMixin): def __init__(self, missing_values=np.nan, - imputation_order='ascending', - n_iter=10, - predictor=None, + estimator=None, sample_posterior=False, + max_iter=10, + tol=1e-3, n_nearest_features=None, initial_strategy="mean", + imputation_order='ascending', min_value=None, max_value=None, verbose=0, random_state=None): self.missing_values = missing_values - self.imputation_order = imputation_order - self.n_iter = n_iter - self.predictor = predictor + self.estimator = estimator self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy + self.imputation_order = imputation_order self.min_value = min_value self.max_value = max_value self.verbose = verbose @@ -576,12 +590,12 @@ def _impute_one_feature(self, mask_missing_values, feat_idx, neighbor_feat_idx, - predictor=None, + estimator=None, fit_mode=True): """Impute a single feature from the others provided. This function predicts the missing values of one of the features using - the current estimates of all the other features. The ``predictor`` must + the current estimates of all the other features. The ``estimator`` must support ``return_std=True`` in its ``predict`` method for this function to work. @@ -599,22 +613,22 @@ def _impute_one_feature(self, neighbor_feat_idx : ndarray Indices of the features to be used in imputing ``feat_idx``. - predictor : object - The predictor to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support + estimator : object + The estimator to use at this step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. - If None, it will be cloned from self._predictor. + If None, it will be cloned from self._estimator. fit_mode : boolean, default=True - Whether to fit and predict with the predictor or just predict. + Whether to fit and predict with the estimator or just predict. Returns ------- X_filled : ndarray Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. - predictor : predictor with sklearn API - The fitted predictor used to impute + estimator : estimator with sklearn API + The fitted estimator used to impute ``X_filled[missing_row_mask, feat_idx]``. 
""" @@ -622,38 +636,46 @@ def _impute_one_feature(self, # (should not happen at fit time because feat_ids would be excluded) missing_row_mask = mask_missing_values[:, feat_idx] if not np.any(missing_row_mask): - return X_filled, predictor + return X_filled, estimator - if predictor is None and fit_mode is False: + if estimator is None and fit_mode is False: raise ValueError("If fit_mode is False, then an already-fitted " - "predictor should be passed in.") + "estimator should be passed in.") - if predictor is None: - predictor = clone(self._predictor) + if estimator is None: + estimator = clone(self._estimator) if fit_mode: X_train = safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask) y_train = safe_indexing(X_filled[:, feat_idx], ~missing_row_mask) - predictor.fit(X_train, y_train) + estimator.fit(X_train, y_train) # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) if self.sample_posterior: - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 + mus, sigmas = estimator.predict(X_test, return_std=True) imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - mus = mus[good_sigmas] - sigmas = sigmas[good_sigmas] + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] a = (self._min_value - mus) / sigmas b = (self._max_value - mus) / sigmas if scipy.__version__ < LooseVersion('0.18'): # bug with vector-valued `a` in old scipy - imputed_values[good_sigmas] = [ + imputed_values[inrange_mask] = [ stats.truncnorm(a=a_, b=b_, loc=loc_, scale=scale_).rvs( random_state=self.random_state_) @@ -662,17 +684,17 @@ def _impute_one_feature(self, else: truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas) - imputed_values[good_sigmas] = truncated_normal.rvs( + imputed_values[inrange_mask] = truncated_normal.rvs( random_state=self.random_state_) else: - imputed_values = predictor.predict(X_test) + imputed_values = estimator.predict(X_test) imputed_values = np.clip(imputed_values, self._min_value, self._max_value) # update the feature X_filled[missing_row_mask, feat_idx] = imputed_values - return X_filled, predictor + return X_filled, estimator def _get_neighbor_feat_idx(self, n_features, @@ -859,19 +881,27 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) - if self.n_iter < 0: + if self.max_iter < 0: raise ValueError( - "'n_iter' should be a positive integer. Got {} instead." - .format(self.n_iter)) + "'max_iter' should be a positive integer. Got {} instead." + .format(self.max_iter)) + + if self.tol < 0: + raise ValueError( + "'tol' should be a non-negative float. Got {} instead." 
+ .format(self.tol) + ) - if self.predictor is None: + if self.estimator is None: from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + self._estimator = BayesianRidge() else: - self._predictor = clone(self.predictor) + self._estimator = clone(self.estimator) - if hasattr(self._predictor, 'random_state'): - self._predictor.random_state = self.random_state_ + self.imputation_sequence_ = [] + + if hasattr(self._estimator, 'random_state'): + self._estimator.random_state = self.random_state_ self._min_value = -np.inf if self.min_value is None else self.min_value self._max_value = np.inf if self.max_value is None else self.max_value @@ -879,7 +909,8 @@ def fit_transform(self, X, y=None): self.initial_imputer_ = None X, Xt, mask_missing_values = self._initial_imputation(X) - if self.n_iter == 0: + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 return Xt # order in which to impute @@ -891,14 +922,15 @@ def fit_transform(self, X, y=None): abs_corr_mat = self._get_abs_corr_mat(Xt) - # impute data n_samples, n_features = Xt.shape - self.imputation_sequence_ = [] if self.verbose > 0: print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(self.n_iter): + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -906,19 +938,32 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - Xt, predictor = self._impute_one_feature( + Xt, estimator = self._impute_one_feature( Xt, mask_missing_values, feat_idx, neighbor_feat_idx, - predictor=None, fit_mode=True) - predictor_triplet = ImputerTriplet(feat_idx, + estimator=None, fit_mode=True) + estimator_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, - predictor) - self.imputation_sequence_.append(predictor_triplet) + estimator) + self.imputation_sequence_.append(estimator_triplet) - if self.verbose > 0: + if self.verbose > 1: print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) - + % (self.n_iter_, self.max_iter, time() - start_t)) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, + axis=None) + if inf_norm < normalized_tol: + if self.verbose > 0: + print('[IterativeImputer] Early stopping criterion ' + 'reached.') + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn("[IterativeImputer] Early stopping criterion not" + " reached.", ConvergenceWarning) Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -942,29 +987,29 @@ def transform(self, X): X, Xt, mask_missing_values = self._initial_imputation(X) - if self.n_iter == 0: + if self.n_iter_ == 0 or np.all(mask_missing_values): return Xt - imputations_per_round = len(self.imputation_sequence_) // self.n_iter + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ i_rnd = 0 if self.verbose > 0: print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for it, predictor_triplet in enumerate(self.imputation_sequence_): + for it, estimator_triplet in enumerate(self.imputation_sequence_): Xt, _ = self._impute_one_feature( Xt, mask_missing_values, - predictor_triplet.feat_idx, - predictor_triplet.neighbor_feat_idx, - 
predictor=predictor_triplet.predictor, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, fit_mode=False ) if not (it + 1) % imputations_per_round: if self.verbose > 1: print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) + % (i_rnd + 1, self.n_iter_, time() - start_t)) i_rnd += 1 Xt[~mask_missing_values] = X[~mask_missing_values] diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index a2bf8d75ef9e5..9d063dd33bec2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -508,6 +508,55 @@ def test_imputation_copy(): # made, even if copy=False. +def test_iterative_imputer_zero_iters(): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + missing_flag = X == 0 + X[missing_flag] = np.nan + + imputer = IterativeImputer(max_iter=0) + X_imputed = imputer.fit_transform(X) + # with max_iter=0, only initial imputation is performed + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + # repeat but force n_iter_ to 0 + imputer = IterativeImputer(max_iter=5).fit(X) + # transformed should not be equal to initial imputation + assert not np.all(imputer.transform(X) == + imputer.initial_imputer_.transform(X)) + + imputer.n_iter_ = 0 + # now they should be equal as only initial imputation is done + assert_allclose(imputer.transform(X), + imputer.initial_imputer_.transform(X)) + + +def test_iterative_imputer_verbose(): + rng = np.random.RandomState(0) + + n = 100 + d = 3 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) + imputer.fit(X) + imputer.transform(X) + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) + imputer.fit(X) + imputer.transform(X) + + +def test_iterative_imputer_all_missing(): + n = 100 + d = 3 + X = np.zeros((n, d)) + imputer = IterativeImputer(missing_values=0, max_iter=1) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + @pytest.mark.parametrize( "imputation_order", ['random', 'roman', 'ascending', 'descending', 'arabic'] @@ -516,22 +565,24 @@ def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 + max_iter = 2 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() X[:, 0] = 1 # this column should not be discarded by IterativeImputer - n_iter = 2 imputer = IterativeImputer(missing_values=0, - n_iter=n_iter, + max_iter=max_iter, n_nearest_features=5, + sample_posterior=False, min_value=0, max_value=1, - verbose=False, + verbose=1, imputation_order=imputation_order, random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] - assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_ + assert (len(ordered_idx) // imputer.n_iter_ == + imputer.n_features_with_missing_) if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) @@ -542,14 +593,14 @@ def test_iterative_imputer_imputation_order(imputation_order): ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: - assert len(ordered_idx) == n_iter * (d - 1) + assert len(ordered_idx) == max_iter * (d - 1) @pytest.mark.parametrize( - "predictor", + "estimator", [None, 
DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()]
 )
-def test_iterative_imputer_predictors(predictor):
+def test_iterative_imputer_estimators(estimator):
     rng = np.random.RandomState(0)
 
     n = 100
@@ -557,20 +608,20 @@ def test_iterative_imputer_predictors(predictor):
     X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
 
     imputer = IterativeImputer(missing_values=0,
-                               n_iter=1,
-                               predictor=predictor,
+                               max_iter=1,
+                               estimator=estimator,
                                random_state=rng)
     imputer.fit_transform(X)
 
-    # check that types are correct for predictors
+    # check that types are correct for estimators
     hashes = []
     for triplet in imputer.imputation_sequence_:
-        expected_type = (type(predictor) if predictor is not None
+        expected_type = (type(estimator) if estimator is not None
                          else type(BayesianRidge()))
-        assert isinstance(triplet.predictor, expected_type)
-        hashes.append(id(triplet.predictor))
+        assert isinstance(triplet.estimator, expected_type)
+        hashes.append(id(triplet.estimator))
 
-    # check that each predictor is unique
+    # check that each estimator is unique
     assert len(set(hashes)) == len(hashes)
 
 
@@ -582,7 +633,7 @@ def test_iterative_imputer_clip():
                                random_state=rng).toarray()
 
     imputer = IterativeImputer(missing_values=0,
-                               n_iter=1,
+                               max_iter=1,
                                min_value=0.1,
                                max_value=0.2,
                                random_state=rng)
@@ -593,12 +644,37 @@ def test_iterative_imputer_clip():
     assert_allclose(Xt[X != 0], X[X != 0])
 
 
+def test_iterative_imputer_clip_truncnorm():
+    rng = np.random.RandomState(0)
+    n = 100
+    d = 10
+    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
+    X[:, 0] = 1
+
+    imputer = IterativeImputer(missing_values=0,
+                               max_iter=2,
+                               n_nearest_features=5,
+                               sample_posterior=True,
+                               min_value=0.1,
+                               max_value=0.2,
+                               verbose=1,
+                               imputation_order='random',
+                               random_state=rng)
+    Xt = imputer.fit_transform(X)
+    assert_allclose(np.min(Xt[X == 0]), 0.1)
+    assert_allclose(np.max(Xt[X == 0]), 0.2)
+    assert_allclose(Xt[X != 0], X[X != 0])
+
+
 def test_iterative_imputer_truncated_normal_posterior():
    # test that the values that are imputed using `sample_posterior=True`
     # with boundaries (`min_value` and `max_value` are not None) are drawn
-    # from a distribution that looks gaussian via the Kolmogorov Smirnov test
+    # from a distribution that looks Gaussian via the Kolmogorov-Smirnov test.
+ # note that starting from the wrong random seed will make this test fail + # because random sampling doesn't occur at all when the imputation + # is outside of the (min_value, max_value) range pytest.importorskip("scipy", minversion="0.17.0") - rng = np.random.RandomState(0) + rng = np.random.RandomState(42) X = rng.normal(size=(5, 5)) X[0][0] = np.nan @@ -610,7 +686,7 @@ def test_iterative_imputer_truncated_normal_posterior(): imputer.fit_transform(X) # generate multiple imputations for the single missing value - imputations = np.array([imputer.transform(X)[0][0] for _ in range(1000)]) + imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)]) assert all(imputations >= 0) assert all(imputations <= 0.5) @@ -641,7 +717,7 @@ def test_iterative_imputer_missing_at_transform(strategy): X_test[0, 0] = 0 # definitely missing value in 0th column imputer = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, initial_strategy=strategy, random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, @@ -664,7 +740,7 @@ def test_iterative_imputer_transform_stochasticity(): # when sample_posterior=True, two transforms shouldn't be equal imputer = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, sample_posterior=True, random_state=rng1) imputer.fit(X) @@ -679,14 +755,14 @@ def test_iterative_imputer_transform_stochasticity(): # and imputation_order is not random # the two transforms should be identical even if rng are different imputer1 = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, sample_posterior=False, n_nearest_features=None, imputation_order='ascending', random_state=rng1) imputer2 = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, sample_posterior=False, n_nearest_features=None, imputation_order='ascending', @@ -706,8 +782,8 @@ def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan - m1 = IterativeImputer(n_iter=10, random_state=rng) - m2 = IterativeImputer(n_iter=10, random_state=rng) + m1 = IterativeImputer(max_iter=10, random_state=rng) + m2 = IterativeImputer(max_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely @@ -726,7 +802,7 @@ def test_iterative_imputer_rank_one(): X_missing = X.copy() X_missing[nan_mask] = np.nan - imputer = IterativeImputer(n_iter=5, + imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng) X_filled = imputer.fit_transform(X_missing) @@ -744,7 +820,6 @@ def test_iterative_imputer_transform_recovery(rank): A = rng.rand(n, rank) B = rng.rand(rank, d) X_filled = np.dot(A, B) - # half is randomly missing nan_mask = rng.rand(n, d) < 0.5 X_missing = X_filled.copy() X_missing[nan_mask] = np.nan @@ -755,7 +830,7 @@ def test_iterative_imputer_transform_recovery(rank): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = IterativeImputer(n_iter=10, + imputer = IterativeImputer(max_iter=10, verbose=1, random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) @@ -783,21 +858,59 @@ def test_iterative_imputer_additive_matrix(): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = IterativeImputer(n_iter=10, - verbose=2, + imputer = IterativeImputer(max_iter=10, + verbose=1, random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) - assert_allclose(X_test_filled, X_test_est, atol=0.01) + assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01) -def test_iterative_imputer_error_param(): - rng = 
np.random.RandomState(42) - X = rng.randn(100, 2) - imputer = IterativeImputer(n_iter=-1) - with pytest.raises(ValueError, match='should be a positive integer'): +@pytest.mark.parametrize("max_iter, tol, error_type, warning", [ + (-1, 1e-3, ValueError, 'should be a positive integer'), + (1, -1e-3, ValueError, 'should be a non-negative float') +]) +def test_iterative_imputer_error_param(max_iter, tol, error_type, warning): + X = np.zeros((100, 2)) + imputer = IterativeImputer(max_iter=max_iter, tol=tol) + with pytest.raises(error_type, match=warning): imputer.fit_transform(X) +def test_iterative_imputer_early_stopping(): + rng = np.random.RandomState(0) + n = 50 + d = 5 + A = rng.rand(n, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=100, + tol=1e-3, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_100 = imputer.fit_transform(X_missing) + assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ + + imputer = IterativeImputer(max_iter=imputer.n_iter_, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_early = imputer.fit_transform(X_missing) + assert_allclose(X_filled_100, X_filled_early, atol=1e-7) + + imputer = IterativeImputer(max_iter=100, + tol=0, + sample_posterior=False, + verbose=1, + random_state=rng) + imputer.fit(X_missing) + assert imputer.n_iter_ == imputer.max_iter + + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), From cb3ec846872a6a43d79353b0a3cde06a82662b64 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Feb 2019 08:51:12 +1100 Subject: [PATCH 19/20] pep8 --- sklearn/utils/estimator_checks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 29e8eec5bceb2..6c4196d919aa0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -73,9 +73,10 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'IterativeImputer', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', - 'PowerTransformer', 'QuantileTransformer'] + 'PowerTransformer', 'QuantileTransformer', 'IterativeImputer'] + SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator'] From c12344051aa76907df195b27f39f4c465a94bdff Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Tue, 12 Feb 2019 21:42:04 -0800 Subject: [PATCH 20/20] API estimator is now first param of IterativeImputer (#13153) --- sklearn/impute.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index b2459a003b16f..3bb0bdd9eff15 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -433,15 +433,15 @@ class IterativeImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : int, np.nan, optional (default=np.nan) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. - estimator : estimator object, default=BayesianRidge() The estimator to use at each step of the round-robin imputation. If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. 
+ missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be imputed. + sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the fitted estimator for each imputation. Estimator must support @@ -559,8 +559,8 @@ class IterativeImputer(BaseEstimator, TransformerMixin): """ def __init__(self, - missing_values=np.nan, estimator=None, + missing_values=np.nan, sample_posterior=False, max_iter=10, tol=1e-3, @@ -572,8 +572,8 @@ def __init__(self, verbose=0, random_state=None): - self.missing_values = missing_values self.estimator = estimator + self.missing_values = missing_values self.sample_posterior = sample_posterior self.max_iter = max_iter self.tol = tol
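The ``sample_posterior`` branch patched into ``_impute_one_feature`` earlier
in this series bounds out-of-range posterior means before drawing from a
truncated normal. A minimal standalone sketch of that
``scipy.stats.truncnorm`` parameterization — variable names mirror the
patch, but the means and sigmas are made up for illustration::

    import numpy as np
    from scipy import stats

    mus = np.array([0.05, 0.15, 0.40])     # posterior means (made up)
    sigmas = np.array([0.00, 0.02, 0.02])  # note one degenerate sigma
    min_value, max_value = 0.1, 0.2

    imputed = np.empty_like(mus)
    positive_sigmas = sigmas > 0
    # degenerate sigmas: no sampling, just take (and bound) the mean
    imputed[~positive_sigmas] = np.clip(mus[~positive_sigmas],
                                        min_value, max_value)
    # means outside [min_value, max_value] would push truncnorm to +/-inf,
    # so they are pinned to the nearest bound instead
    mus_too_low = mus < min_value
    mus_too_high = mus > max_value
    imputed[mus_too_low & positive_sigmas] = min_value
    imputed[mus_too_high & positive_sigmas] = max_value
    # the rest can be sampled safely; truncnorm expects the bounds in
    # standard-deviation units around loc
    inrange = positive_sigmas & ~mus_too_low & ~mus_too_high
    a = (min_value - mus[inrange]) / sigmas[inrange]
    b = (max_value - mus[inrange]) / sigmas[inrange]
    imputed[inrange] = stats.truncnorm(
        a=a, b=b, loc=mus[inrange], scale=sigmas[inrange]).rvs(
            random_state=0)
    assert np.all((imputed >= min_value) & (imputed <= max_value))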