From fa148c983e47d27836a17cb07c968d479d464fd8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 16:26:47 +1100 Subject: [PATCH 1/9] Debugging a doctest heisenbug: Add unit test equivalent --- sklearn/tests/test_impute.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index fd2bbd4ec5ad0..f9d0cae82c64a 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -931,3 +931,14 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) + + +def test_heisenbug(): + imp = IterativeImputer(n_iter=10, random_state=0) + imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) + X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + assert_allclose( + np.round(imp.transform(X_test)), + [[1., 2.] + [6., 3.] + [26., 6.]]) From 09bb175d2dda49b813490453c92daa96655d4af3 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 17:04:05 +1100 Subject: [PATCH 2/9] Fix copy-paste error --- sklearn/tests/test_impute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f9d0cae82c64a..124a75368b752 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -939,6 +939,6 @@ def test_heisenbug(): X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] assert_allclose( np.round(imp.transform(X_test)), - [[1., 2.] - [6., 3.] + [[1., 2.], + [6., 3.], [26., 6.]]) From c30412073533fe4d184d17dc3c113c7efd7907a1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 18:28:53 +1100 Subject: [PATCH 3/9] Some debug output --- sklearn/impute.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/impute.py b/sklearn/impute.py index f888d7fe83d4f..d7ac2fa0fc347 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -968,6 +968,9 @@ def transform(self, X): predictor=predictor_triplet.predictor, fit_mode=False ) + print(it, 'int:', predictor_triplet.predictor.intercept_, + 'coef:', predictor_triplet.predictor.coef_) + print('out:', Xt) if not (it + 1) % imputations_per_round: if self.verbose > 1: print('[IterativeImputer] Ending imputation round ' From 2f5694c33f2b664ceb3a48cb004a1bb7c57d2adc Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 18:55:12 +1100 Subject: [PATCH 4/9] Show transformation during training --- sklearn/impute.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/impute.py b/sklearn/impute.py index d7ac2fa0fc347..f16821a0c272f 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -917,6 +917,7 @@ def fit_transform(self, X, y=None): Xt, predictor = self._impute_one_feature( Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) + print(Xt) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) From efea77888ecfdb024f73a92c787c587dcb35b3d3 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 19:02:55 +1100 Subject: [PATCH 5/9] Show alpha --- sklearn/impute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index f16821a0c272f..cd9632dc70156 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -970,7 +970,8 @@ def transform(self, X): fit_mode=False ) print(it, 'int:', predictor_triplet.predictor.intercept_, - 'coef:', predictor_triplet.predictor.coef_) + 'coef:', predictor_triplet.predictor.coef_, + 'alpha:', predictor_triplet.predictor.alpha_) 
print('out:', Xt) if not (it + 1) % imputations_per_round: if self.verbose > 1: From 83c481df29af98771454a784e018ec8639a2292e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 11:38:07 +1100 Subject: [PATCH 6/9] Make the example more obvious --- doc/modules/impute.rst | 7 +- sklearn/cluster/tests/test_bicluster.py | 19 ++ sklearn/impute.py | 410 +++++++++++------------- sklearn/tests/test_impute.py | 11 - 4 files changed, 212 insertions(+), 235 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1d1f6e926e8f8..76d170e506c2a 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,16 +106,17 @@ round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + >>> imp.fit([[1, 2], [3, 6], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, sample_posterior=False, verbose=False) + # the model learns that the second feature is double the first >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 3.] - [26. 6.]] + [ 6. 12.] + [ 3. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index dd5e91c18c27e..40ab9f8961667 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -232,6 +232,25 @@ def test_perfect_checkerboard(): (rows, cols)), 1) +def test_compare_svd_methods(): + data = np.array([[-2, -4, 2], + [-2, 1, 2], + [4, 2, 5]]) + + model_rand = SpectralCoclustering(n_clusters=2, + svd_method='randomized', + random_state=0) + model_rand.fit(data) + + model_arpack = SpectralCoclustering(n_clusters=2, + svd_method='arpack', + random_state=0) + model_arpack.fit(data) + + assert_array_equal(model_rand.column_labels_, + model_arpack.column_labels_) + + def test_errors(): data = np.arange(25).reshape((5, 5)) diff --git a/sklearn/impute.py b/sklearn/impute.py index cd9632dc70156..fec9d8b0d7a8d 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -6,13 +6,11 @@ from __future__ import division import warnings -import numbers from time import time -from distutils.version import LooseVersion +import numbers import numpy as np import numpy.ma as ma -import scipy from scipy import sparse from scipy import stats from collections import namedtuple @@ -27,6 +25,10 @@ from .utils.fixes import _object_dtype_isnan from .utils import is_scalar_nan +from .externals import six + +zip = six.moves.zip +map = six.moves.map ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', 'neighbor_feat_idx', @@ -35,7 +37,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'IterativeImputer', + 'ChainedImputer', ] @@ -140,6 +142,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): a new copy will always be made, even if `copy=False`: - If X is not an array of floating values; + - If X is sparse and `missing_values=0`; - If X is encoded as a CSR matrix. 
Attributes @@ -147,26 +150,6 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. - See also - -------- - IterativeImputer : Multivariate imputation of missing values. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import SimpleImputer - >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') - >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) - ... # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, fill_value=None, missing_values=nan, - strategy='mean', verbose=0) - >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] - >>> print(imp_mean.transform(X)) - ... # doctest: +NORMALIZE_WHITESPACE - [[ 7. 2. 3. ] - [ 4. 3.5 6. ] - [10. 3.5 9. ]] - Notes ----- Columns which only contained missing values at `fit` are discarded upon @@ -254,17 +237,10 @@ def fit(self, X, y=None): "data".format(fill_value)) if sparse.issparse(X): - # missing_values = 0 not allowed with sparse data as it would - # force densification - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, @@ -275,41 +251,80 @@ def fit(self, X, y=None): def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" - mask_data = _get_mask(X.data, missing_values) - n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + # Count the zeros + if missing_values == 0: + n_zeros_axis = np.zeros(X.shape[1], dtype=int) + else: + n_zeros_axis = X.shape[0] - np.diff(X.indptr) - statistics = np.empty(X.shape[1]) + # Mean + if strategy == "mean": + if missing_values != 0: + n_non_missing = n_zeros_axis + + # Mask the missing elements + mask_missing_values = _get_mask(X.data, missing_values) + mask_valids = np.logical_not(mask_missing_values) + + # Sum only the valid elements + new_data = X.data.copy() + new_data[mask_missing_values] = 0 + X = sparse.csc_matrix((new_data, X.indices, X.indptr), + copy=False) + sums = X.sum(axis=0) + + # Count the elements != 0 + mask_non_zeros = sparse.csc_matrix( + (mask_valids.astype(np.float64), + X.indices, + X.indptr), copy=False) + s = mask_non_zeros.sum(axis=0) + n_non_missing = np.add(n_non_missing, s) + + else: + sums = X.sum(axis=0) + n_non_missing = np.diff(X.indptr) - if strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - statistics.fill(fill_value) + # Ignore the error, columns with a np.nan statistics_ + # are not an error at this point. 
These columns will + # be removed in transform + with np.errstate(all="ignore"): + return np.ravel(sums) / np.ravel(n_non_missing) + # Median + Most frequent + Constant else: - for i in range(X.shape[1]): - column = X.data[X.indptr[i]:X.indptr[i + 1]] - mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] - column = column[~mask_column] - - # combine explicit and implicit zeros - mask_zeros = _get_mask(column, 0) - column = column[~mask_zeros] - n_explicit_zeros = mask_zeros.sum() - n_zeros = n_implicit_zeros[i] + n_explicit_zeros - - if strategy == "mean": - s = column.size + n_zeros - statistics[i] = np.nan if s == 0 else column.sum() / s - - elif strategy == "median": - statistics[i] = _get_median(column, - n_zeros) - - elif strategy == "most_frequent": - statistics[i] = _most_frequent(column, - 0, - n_zeros) - return statistics + # Remove the missing values, for each column + columns_all = np.hsplit(X.data, X.indptr[1:-1]) + mask_missing_values = _get_mask(X.data, missing_values) + mask_valids = np.hsplit(np.logical_not(mask_missing_values), + X.indptr[1:-1]) + + # astype necessary for bug in numpy.hsplit before v1.9 + columns = [col[mask.astype(bool, copy=False)] + for col, mask in zip(columns_all, mask_valids)] + + # Median + if strategy == "median": + median = np.empty(len(columns)) + for i, column in enumerate(columns): + median[i] = _get_median(column, n_zeros_axis[i]) + + return median + + # Most frequent + elif strategy == "most_frequent": + most_frequent = np.empty(len(columns)) + + for i, column in enumerate(columns): + most_frequent[i] = _most_frequent(column, + 0, + n_zeros_axis[i]) + + return most_frequent + + # Constant + elif strategy == "constant": + return np.full(X.shape[1], fill_value) def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" @@ -359,8 +374,6 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): @@ -368,7 +381,7 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. """ check_is_fitted(self, 'statistics_') @@ -399,19 +412,17 @@ def transform(self, X): X = X[:, valid_statistics_indexes] # Do actual imputation - if sparse.issparse(X): - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] + if sparse.issparse(X) and self.missing_values != 0: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), + np.diff(X.indptr))[mask] - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) else: + if sparse.issparse(X): + X = X.toarray() + mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) @@ -422,13 +433,14 @@ def transform(self, X): return X -class IterativeImputer(BaseEstimator, TransformerMixin): - """Multivariate imputer that estimates each feature from all the others. 
+class ChainedImputer(BaseEstimator, TransformerMixin): + """Chained imputer transformer to impute missing values. - A strategy for imputing missing values by modeling each feature with - missing values as a function of other features in a round-robin fashion. + Basic implementation of chained imputer from MICE (Multivariate + Imputations by Chained Equations) package from R. This version assumes all + of the features are Gaussian. - Read more in the :ref:`User Guide <iterative_imputer>`. + Read more in the :ref:`User Guide <chained_imputer>`. Parameters ---------- @@ -450,34 +462,24 @@ class IterativeImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_iter : int, optional (default=10) - Number of imputation rounds to perform before returning the imputations - computed during the final round. A round is a single imputation of each - feature with missing values. + n_imputations : int, optional (default=100) + Number of chained imputation rounds to perform, the results of which + will be used in the final average. + + n_burn_in : int, optional (default=10) + Number of initial imputation rounds to perform the results of which + will not be returned. - predictor : estimator object, default=RidgeCV() or BayesianRidge() + predictor : estimator object, default=BayesianRidge() The predictor to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support - ``return_std`` in its ``predict`` method. Also, if - ``sample_posterior=True`` the default predictor will be - :class:`sklearn.linear_model.BayesianRidge` and - :class:`sklearn.linear_model.RidgeCV` otherwise. - - sample_posterior : boolean, default=False - Whether to sample from the (Gaussian) predictive posterior of the - fitted predictor for each imputation. Predictor must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. + It must support ``return_std`` in its ``predict`` method. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - each feature column. Nearness between features is measured using + the each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). To ensure coverage of features throughout the - imputation process, the neighbor features are not necessarily nearest, - but are drawn with probability proportional to correlation for each - imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. + initial imputation). Can provide significant speed-up when the number + of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -498,43 +500,37 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use. Randomizes - selection of predictor features if n_nearest_features is not None, the - ``imputation_order`` if ``random``, and the sampling from posterior if - ``sample_posterior`` is True. Use an integer for determinism. - See :term:`the Glossary <random_state>`. + The seed of the pseudo random number generator to use when shuffling + the data. 
If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by ``np.random``. Attributes ---------- - initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. + initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' + The imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. Length is ``self.n_features_with_missing_ * n_iter``. - - n_features_with_missing_ : int - Number of features with missing values. - - See also - -------- - SimpleImputer : Univariate imputation of missing values. + the imputation. Notes ----- - To support imputation in inductive mode we store each feature's predictor - during the ``fit`` phase, and predict without refitting (in order) during - the ``transform`` phase. + The R version of MICE does not have inductive functionality, i.e. first + fitting on ``X_train`` and then transforming any ``X_test`` without + additional fitting. We do this by storing each feature's predictor during + the round-robin ``fit`` phase, and predicting without refitting (in order) + during the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values during ``transform`` which did not have any - missing values during ``fit`` will be imputed with the initial imputation - method only. + Features with missing values in transform which did not have any missing + values in fit will be imputed with the initial imputation method only. References ---------- .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. <https://www.jstatsoft.org/article/view/v045i03>`_ - - .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in - Multivariate Data Suitable for use with an Electronic Computer". - Journal of the Royal Statistical Society 22(2): 302-306. - <https://www.jstor.org/stable/2984099>`_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_iter=10, + n_imputations=100, + n_burn_in=10, predictor=None, - sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -564,9 +555,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_iter = n_iter + self.n_imputations = n_imputations + self.n_burn_in = n_burn_in self.predictor = predictor - self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -604,8 +595,7 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support - ``return_std`` in its ``predict`` method. + It must support ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. 
fit_mode : boolean, default=True @@ -644,34 +634,17 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - if self.sample_posterior: - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - mus = mus[good_sigmas] - sigmas = sigmas[good_sigmas] - a = (self._min_value - mus) / sigmas - b = (self._max_value - mus) / sigmas - - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[good_sigmas] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[good_sigmas] = truncated_normal.rvs( - random_state=self.random_state_) - else: - imputed_values = predictor.predict(X_test) - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + + # clip the values + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) # update the feature X_filled[missing_row_mask, feat_idx] = imputed_values @@ -862,51 +835,44 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) - if self.n_iter < 0: - raise ValueError( - "'n_iter' should be a positive integer. Got {} instead." - .format(self.n_iter)) - if self.predictor is None: - if self.sample_posterior: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() - else: - from .linear_model import RidgeCV - # including a very small alpha to approximate OLS - self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() else: self._predictor = clone(self.predictor) - if hasattr(self._predictor, 'random_state'): - self._predictor.random_state = self.random_state_ - - self._min_value = -np.inf if self.min_value is None else self.min_value - self._max_value = np.inf if self.max_value is None else self.max_value + self._min_value = np.nan if self.min_value is None else self.min_value + self._max_value = np.nan if self.max_value is None else self.max_value self.initial_imputer_ = None - X, Xt, mask_missing_values = self._initial_imputation(X) + X, X_filled, mask_missing_values = self._initial_imputation(X) + + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled - if self.n_iter == 0: - return Xt + X_filled = np.clip(X_filled, self._min_value, self._max_value) # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. 
# see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) - self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(Xt) + abs_corr_mat = self._get_abs_corr_mat(X_filled) # impute data - n_samples, n_features = Xt.shape + n_rounds = self.n_burn_in + self.n_imputations + n_samples, n_features = X_filled.shape + Xt = np.zeros((n_samples, n_features), dtype=X.dtype) self.imputation_sequence_ = [] if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" + print("[ChainedImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(self.n_iter): + for i_rnd in range(n_rounds): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -914,20 +880,22 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - Xt, predictor = self._impute_one_feature( - Xt, mask_missing_values, feat_idx, neighbor_feat_idx, + X_filled, predictor = self._impute_one_feature( + X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) - print(Xt) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) + if i_rnd >= self.n_burn_in: + Xt += X_filled if self.verbose > 0: - print('[IterativeImputer] Ending imputation round ' + print('[ChainedImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) + % (i_rnd + 1, n_rounds, time() - start_t)) + Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -949,37 +917,44 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, Xt, mask_missing_values = self._initial_imputation(X) + X, X_filled, mask_missing_values = self._initial_imputation(X) - if self.n_iter == 0: - return Xt + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled - imputations_per_round = len(self.imputation_sequence_) // self.n_iter + X_filled = np.clip(X_filled, self._min_value, self._max_value) + + n_rounds = self.n_burn_in + self.n_imputations + n_imputations = len(self.imputation_sequence_) + imputations_per_round = n_imputations // n_rounds i_rnd = 0 + Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" + print("[ChainedImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - Xt, _ = self._impute_one_feature( - Xt, + X_filled, _ = self._impute_one_feature( + X_filled, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, predictor=predictor_triplet.predictor, fit_mode=False ) - print(it, 'int:', predictor_triplet.predictor.intercept_, - 'coef:', predictor_triplet.predictor.coef_, - 'alpha:', predictor_triplet.predictor.alpha_) - print('out:', Xt) if not (it + 1) % imputations_per_round: + if i_rnd >= self.n_burn_in: + Xt += X_filled if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' + print('[ChainedImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) + % (i_rnd + 1, n_rounds, time() - start_t)) i_rnd += 1 + Xt /= 
self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -1006,18 +981,11 @@ def fit(self, X, y=None): class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. - Note that this component typically should not not be used in a vanilla - :class:`Pipeline` consisting of transformers and a classifier, but rather - could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. - - Read more in the :ref:`User Guide <impute>`. - Parameters ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. + `missing_values` will be imputed. features : str, optional Whether the imputer mask should represent all or a subset of @@ -1038,7 +1006,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): error_on_new : boolean, optional If True (default), transform will raise an error when there are features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. + in fit This is applicable only when ``features="missing-only"``. Attributes ---------- @@ -1058,7 +1026,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [np.nan, 2, 3], ... [2, 4, 0]]) >>> indicator = MissingIndicator() - >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE + >>> indicator.fit(X1) MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan, sparse='auto') >>> X2_tr = indicator.transform(X2) @@ -1159,7 +1127,7 @@ def fit(self, X, y=None): raise ValueError("'features' has to be either 'missing-only' or " "'all'. Got {} instead.".format(self.features)) - if not ((isinstance(self.sparse, str) and + if not ((isinstance(self.sparse, six.string_types) and self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("'sparse' has to be a boolean or 'auto'. 
" "Got {!r} instead.".format(self.sparse)) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 124a75368b752..fd2bbd4ec5ad0 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -931,14 +931,3 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) - - -def test_heisenbug(): - imp = IterativeImputer(n_iter=10, random_state=0) - imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) - X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] - assert_allclose( - np.round(imp.transform(X_test)), - [[1., 2.], - [6., 3.], - [26., 6.]]) From d8b7008d5ca73796634c8b58e602ed9ff3faddb6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 11:54:26 +1100 Subject: [PATCH 7/9] Resurrect after a git typo --- sklearn/impute.py | 405 ++++++++++++++++++++++++---------------------- 1 file changed, 216 insertions(+), 189 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index fec9d8b0d7a8d..f888d7fe83d4f 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -6,11 +6,13 @@ from __future__ import division import warnings -from time import time import numbers +from time import time +from distutils.version import LooseVersion import numpy as np import numpy.ma as ma +import scipy from scipy import sparse from scipy import stats from collections import namedtuple @@ -25,10 +27,6 @@ from .utils.fixes import _object_dtype_isnan from .utils import is_scalar_nan -from .externals import six - -zip = six.moves.zip -map = six.moves.map ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', 'neighbor_feat_idx', @@ -37,7 +35,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'ChainedImputer', + 'IterativeImputer', ] @@ -142,7 +140,6 @@ class SimpleImputer(BaseEstimator, TransformerMixin): a new copy will always be made, even if `copy=False`: - If X is not an array of floating values; - - If X is sparse and `missing_values=0`; - If X is encoded as a CSR matrix. Attributes @@ -150,6 +147,26 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + ... # doctest: +NORMALIZE_WHITESPACE + SimpleImputer(copy=True, fill_value=None, missing_values=nan, + strategy='mean', verbose=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + ... # doctest: +NORMALIZE_WHITESPACE + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + Notes ----- Columns which only contained missing values at `fit` are discarded upon @@ -237,10 +254,17 @@ def fit(self, X, y=None): "data".format(fill_value)) if sparse.issparse(X): - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. 
Provide a dense " + "array instead.") + else: + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, @@ -251,80 +275,41 @@ def fit(self, X, y=None): def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" - # Count the zeros - if missing_values == 0: - n_zeros_axis = np.zeros(X.shape[1], dtype=int) - else: - n_zeros_axis = X.shape[0] - np.diff(X.indptr) + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) - # Mean - if strategy == "mean": - if missing_values != 0: - n_non_missing = n_zeros_axis - - # Mask the missing elements - mask_missing_values = _get_mask(X.data, missing_values) - mask_valids = np.logical_not(mask_missing_values) - - # Sum only the valid elements - new_data = X.data.copy() - new_data[mask_missing_values] = 0 - X = sparse.csc_matrix((new_data, X.indices, X.indptr), - copy=False) - sums = X.sum(axis=0) - - # Count the elements != 0 - mask_non_zeros = sparse.csc_matrix( - (mask_valids.astype(np.float64), - X.indices, - X.indptr), copy=False) - s = mask_non_zeros.sum(axis=0) - n_non_missing = np.add(n_non_missing, s) - - else: - sums = X.sum(axis=0) - n_non_missing = np.diff(X.indptr) + statistics = np.empty(X.shape[1]) - # Ignore the error, columns with a np.nan statistics_ - # are not an error at this point. These columns will - # be removed in transform - with np.errstate(all="ignore"): - return np.ravel(sums) / np.ravel(n_non_missing) + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) - # Median + Most frequent + Constant else: - # Remove the missing values, for each column - columns_all = np.hsplit(X.data, X.indptr[1:-1]) - mask_missing_values = _get_mask(X.data, missing_values) - mask_valids = np.hsplit(np.logical_not(mask_missing_values), - X.indptr[1:-1]) - - # astype necessary for bug in numpy.hsplit before v1.9 - columns = [col[mask.astype(bool, copy=False)] - for col, mask in zip(columns_all, mask_valids)] - - # Median - if strategy == "median": - median = np.empty(len(columns)) - for i, column in enumerate(columns): - median[i] = _get_median(column, n_zeros_axis[i]) - - return median - - # Most frequent - elif strategy == "most_frequent": - most_frequent = np.empty(len(columns)) - - for i, column in enumerate(columns): - most_frequent[i] = _most_frequent(column, - 0, - n_zeros_axis[i]) - - return most_frequent - - # Constant - elif strategy == "constant": - return np.full(X.shape[1], fill_value) + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, + n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, + 0, + n_zeros) + return statistics def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" @@ -374,6 +359,8 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == 
"constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): @@ -381,7 +368,7 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. """ check_is_fitted(self, 'statistics_') @@ -412,17 +399,19 @@ def transform(self, X): X = X[:, valid_statistics_indexes] # Do actual imputation - if sparse.issparse(X) and self.missing_values != 0: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] + if sparse.issparse(X): + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), + np.diff(X.indptr))[mask] - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) else: - if sparse.issparse(X): - X = X.toarray() - mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) @@ -433,14 +422,13 @@ def transform(self, X): return X -class ChainedImputer(BaseEstimator, TransformerMixin): - """Chained imputer transformer to impute missing values. +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. - Basic implementation of chained imputer from MICE (Multivariate - Imputations by Chained Equations) package from R. This version assumes all - of the features are Gaussian. + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -462,24 +450,34 @@ class ChainedImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_imputations : int, optional (default=100) - Number of chained imputation rounds to perform, the results of which - will be used in the final average. - - n_burn_in : int, optional (default=10) - Number of initial imputation rounds to perform the results of which - will not be returned. + n_iter : int, optional (default=10) + Number of imputation rounds to perform before returning the imputations + computed during the final round. A round is a single imputation of each + feature with missing values. - predictor : estimator object, default=BayesianRidge() + predictor : estimator object, default=RidgeCV() or BayesianRidge() The predictor to use at each step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. Also, if + ``sample_posterior=True`` the default predictor will be + :class:`sklearn.linear_model.BayesianRidge` and + :class:`sklearn.linear_model.RidgeCV` otherwise. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted predictor for each imputation. Predictor must support + ``return_std`` in its ``predict`` method if set to ``True``. 
Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - the each feature column. Nearness between features is measured using + each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). Can provide significant speed-up when the number - of features is huge. If ``None``, all features will be used. + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -500,37 +498,43 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by ``np.random``. + The seed of the pseudo random number generator to use. Randomizes + selection of predictor features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary <random_state>`. Attributes ---------- - initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' - The imputer used to initialize the missing values. + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. + the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. Notes ----- - The R version of MICE does not have inductive functionality, i.e. first - fitting on ``X_train`` and then transforming any ``X_test`` without - additional fitting. We do this by storing each feature's predictor during - the round-robin ``fit`` phase, and predicting without refitting (in order) - during the ``transform`` phase. + To support imputation in inductive mode we store each feature's predictor + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values in transform which did not have any missing - values in fit will be imputed with the initial imputation method only. + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. 
References ---------- .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. <https://www.jstatsoft.org/article/view/v045i03>`_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + <https://www.jstor.org/stable/2984099>`_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_imputations=100, - n_burn_in=10, + n_iter=10, predictor=None, + sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -555,9 +564,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_imputations = n_imputations - self.n_burn_in = n_burn_in + self.n_iter = n_iter self.predictor = predictor + self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -595,7 +604,8 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. fit_mode : boolean, default=True @@ -634,17 +644,34 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - imputed_values[good_sigmas] = self.random_state_.normal( - loc=mus[good_sigmas], scale=sigmas[good_sigmas]) - - # clip the values - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) + if self.sample_posterior: + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + mus = mus[good_sigmas] + sigmas = sigmas[good_sigmas] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[good_sigmas] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[good_sigmas] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = predictor.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) # update the feature X_filled[missing_row_mask, feat_idx] = imputed_values @@ -835,44 +862,51 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) + if self.n_iter < 0: + raise ValueError( + "'n_iter' should be a positive integer. Got {} instead." 
+ .format(self.n_iter)) + if self.predictor is None: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + if self.sample_posterior: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + from .linear_model import RidgeCV + # including a very small alpha to approximate OLS + self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) else: self._predictor = clone(self.predictor) - self._min_value = np.nan if self.min_value is None else self.min_value - self._max_value = np.nan if self.max_value is None else self.max_value + if hasattr(self._predictor, 'random_state'): + self._predictor.random_state = self.random_state_ - self.initial_imputer_ = None - X, X_filled, mask_missing_values = self._initial_imputation(X) + self._min_value = -np.inf if self.min_value is None else self.min_value + self._max_value = np.inf if self.max_value is None else self.max_value - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + self.initial_imputer_ = None + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. # see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(X_filled) + abs_corr_mat = self._get_abs_corr_mat(Xt) # impute data - n_rounds = self.n_burn_in + self.n_imputations - n_samples, n_features = X_filled.shape - Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + n_samples, n_features = Xt.shape self.imputation_sequence_ = [] if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(n_rounds): + for i_rnd in range(self.n_iter): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -880,22 +914,19 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - X_filled, predictor = self._impute_one_feature( - X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + Xt, predictor = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 0: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -917,28 +948,20 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, X_filled, mask_missing_values = self._initial_imputation(X) + X, Xt, mask_missing_values = self._initial_imputation(X) - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial 
imputation (before clipping) - if self.n_imputations < 1: - return X_filled + if self.n_iter == 0: + return Xt - X_filled = np.clip(X_filled, self._min_value, self._max_value) - - n_rounds = self.n_burn_in + self.n_imputations - n_imputations = len(self.imputation_sequence_) - imputations_per_round = n_imputations // n_rounds + imputations_per_round = len(self.imputation_sequence_) // self.n_iter i_rnd = 0 - Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - X_filled, _ = self._impute_one_feature( - X_filled, + Xt, _ = self._impute_one_feature( + Xt, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, predictor=predictor_triplet.predictor, fit_mode=False ) if not (it + 1) % imputations_per_round: - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 1: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) i_rnd += 1 - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -981,11 +1001,18 @@ def fit(self, X, y=None): class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. + Note that this component typically should not not be used in a vanilla + :class:`Pipeline` consisting of transformers and a classifier, but rather + could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. + + Read more in the :ref:`User Guide <impute>`. + Parameters ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be indicated (True in the output array), the + other values will be marked as False. features : str, optional Whether the imputer mask should represent all or a subset of @@ -1006,7 +1038,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): error_on_new : boolean, optional If True (default), transform will raise an error when there are features with missing values in transform that have no missing values - in fit This is applicable only when ``features="missing-only"``. + in fit. This is applicable only when ``features="missing-only"``. Attributes ---------- @@ -1026,7 +1058,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): ... [np.nan, 2, 3], ... [2, 4, 0]]) >>> indicator = MissingIndicator() - >>> indicator.fit(X1) + >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan, sparse='auto') >>> X2_tr = indicator.transform(X2) @@ -1127,7 +1159,7 @@ def fit(self, X, y=None): raise ValueError("'features' has to be either 'missing-only' or " "'all'. Got {} instead.".format(self.features)) - if not ((isinstance(self.sparse, six.string_types) and + if not ((isinstance(self.sparse, str) and self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("'sparse' has to be a boolean or 'auto'. 
" "Got {!r} instead.".format(self.sparse)) From 6268e27ffe119aee34fe4e8048c13ce13680942f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 13:15:00 +1100 Subject: [PATCH 8/9] More git management fails --- sklearn/cluster/tests/test_bicluster.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 40ab9f8961667..dd5e91c18c27e 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -232,25 +232,6 @@ def test_perfect_checkerboard(): (rows, cols)), 1) -def test_compare_svd_methods(): - data = np.array([[-2, -4, 2], - [-2, 1, 2], - [4, 2, 5]]) - - model_rand = SpectralCoclustering(n_clusters=2, - svd_method='randomized', - random_state=0) - model_rand.fit(data) - - model_arpack = SpectralCoclustering(n_clusters=2, - svd_method='arpack', - random_state=0) - model_arpack.fit(data) - - assert_array_equal(model_rand.column_labels_, - model_arpack.column_labels_) - - def test_errors(): data = np.arange(25).reshape((5, 5)) From 5285e833155a41d3cdb0a42ad40b86e2dfa17a76 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 Jan 2019 21:43:04 +1100 Subject: [PATCH 9/9] Fixes to doctest --- doc/modules/impute.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 76d170e506c2a..45523d74fe9b8 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,16 +106,16 @@ round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [3, 6], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, sample_posterior=False, verbose=False) - # the model learns that the second feature is double the first >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 12.] + [ 6. 12.] [ 3. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline