From ac8fdbd88217a67ca7d228d69c90eb1e6f976e51 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Wed, 10 Jun 2015 14:22:56 -0400
Subject: [PATCH 01/17] add simple knn imputation; only works for dense matrices

---
 sklearn/preprocessing/imputation.py           | 73 ++++++++++++++++---
 .../preprocessing/tests/test_imputation.py    | 66 ++++++++++++++++-
 2 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 0ef23c471bd60..cfcaef271c05c 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -8,6 +8,7 @@
 from scipy import sparse
 from scipy import stats
 
+from ..neighbors import KDTree, NearestNeighbors
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
 from ..utils import as_float_array
@@ -61,6 +62,7 @@ def _most_frequent(array, extra_value, n_repeat):
             return extra_value
 
 
+
 class Imputer(BaseEstimator, TransformerMixin):
     """Imputation transformer for completing missing values.
 
@@ -82,6 +84,8 @@ class Imputer(BaseEstimator, TransformerMixin):
           the axis.
         - If "most_frequent", then replace missing using the most frequent
           value along the axis.
+        - If "knn", then replace missing using the mean of the k-nearest neighbors
+          along the axis.
 
     axis : integer, optional (default=0)
         The axis along which to impute.
@@ -102,6 +106,10 @@ class Imputer(BaseEstimator, TransformerMixin):
         - If `axis=0` and X is encoded as a CSR matrix;
         - If `axis=1` and X is encoded as a CSC matrix.
 
+    kneighbor : int, optional (default=1)
+        It only has effect if the strategy is "knn". It controls the number of nearest
+        neighbors used to compute the mean along the axis.
+
     Attributes
     ----------
     statistics_ : array of shape (n_features,)
@@ -116,12 +124,13 @@ class Imputer(BaseEstimator, TransformerMixin):
       contain missing values).
     """
     def __init__(self, missing_values="NaN", strategy="mean",
-                 axis=0, verbose=0, copy=True):
+                 axis=0, verbose=0, copy=True, kneighbor=1):
         self.missing_values = missing_values
         self.strategy = strategy
         self.axis = axis
         self.verbose = verbose
         self.copy = copy
+        self.kneighbor = kneighbor
 
     def fit(self, X, y=None):
         """Fit the imputer on X.
@@ -138,7 +147,7 @@ def fit(self, X, y=None):
             Returns self.
         """
         # Check parameters
-        allowed_strategies = ["mean", "median", "most_frequent"]
+        allowed_strategies = ["mean", "median", "most_frequent", "knn"]
         if self.strategy not in allowed_strategies:
             raise ValueError("Can only use these strategies: {0} "
                              " got strategy={1}".format(allowed_strategies,
@@ -248,6 +257,10 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
 
                 return most_frequent
 
+            elif strategy == "knn":
+                raise ValueError("Sparse matrix not supported!")
+
+
     def _dense_fit(self, X, strategy, missing_values, axis):
         """Fit the transformer on dense data."""
         X = check_array(X, force_all_finite=False)
@@ -299,6 +312,22 @@ def _dense_fit(self, X, strategy, missing_values, axis):
 
             return most_frequent
 
+        # KNN
+        elif strategy == "knn":
+
+            if axis == 1:
+                X = X.copy().transpose()
+
+            full_data = X[np.logical_not(mask.any(1))]
+            if full_data.size == 0:
+                raise ValueError("There is no row with complete data!")
+            if full_data.shape[0] < self.kneighbor:
+                raise ValueError("There are at most %d neighbors!" %(full_data.shape[0]))
+            if axis == 1:
+                full_data = full_data.transpose()
+
+            return full_data
+
     def transform(self, X):
         """Impute all missing values in X.
 
@@ -341,7 +370,9 @@ def transform(self, X):
         valid_mask = np.logical_not(invalid_mask)
         valid_statistics = statistics[valid_mask]
         valid_statistics_indexes = np.where(valid_mask)[0]
-        missing = np.arange(X.shape[not self.axis])[invalid_mask]
+
+        if self.strategy != "knn":
+            missing = np.arange(X.shape[not self.axis])[invalid_mask]
 
         if self.axis == 0 and invalid_mask.any():
             if self.verbose:
@@ -366,13 +397,37 @@ def transform(self, X):
 
             mask = _get_mask(X, self.missing_values)
             n_missing = np.sum(mask, axis=self.axis)
-            values = np.repeat(valid_statistics, n_missing)
 
-            if self.axis == 0:
-                coordinates = np.where(mask.transpose())[::-1]
+            if self.strategy == 'knn':
+                if self.axis == 1:
+                    X = X.transpose()
+                    mask = mask.transpose()
+                    statistics = statistics.transpose()
+                missing_index = np.where(mask.any(1))[0]
+                for i, row in zip(missing_index, X[missing_index]):
+                    col_index = np.where(np.logical_not(np.isnan(row)))[0]
+                    impute_index = np.where(np.isnan(row))[0]
+                    neigh = NearestNeighbors(self.kneighbor)
+                    neigh = neigh.fit(statistics[:, col_index])
+                    _dist, ind = neigh.kneighbors(row[np.logical_not(np.isnan(row))],
+                                           self.kneighbor)
+                    #tree = KDTree(statistics[:, col_index])
+                    #dist, ind = tree.query(row[np.logical_not(np.isnan(row))],
+                    #                       k=self.kneighbor)
+                    nn_index = ind[0]
+                    X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0)
+
+                if self.axis == 1:
+                    X = X.transpose()
+
             else:
-                coordinates = mask
+                values = np.repeat(valid_statistics, n_missing)
+
+                if self.axis == 0:
+                    coordinates = np.where(mask.transpose())[::-1]
+                else:
+                    coordinates = mask
 
-            X[coordinates] = values
+                X[coordinates] = values
 
-        return X
+        return X
\ No newline at end of file
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
index bfcfc2a753b6a..fbd228367bbab 100644
--- a/sklearn/preprocessing/tests/test_imputation.py
+++ b/sklearn/preprocessing/tests/test_imputation.py
@@ -4,6 +4,7 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_true
 
@@ -15,7 +16,7 @@
 
 
 def _check_statistics(X, X_true,
-                      strategy, statistics, missing_values):
+                      strategy, statistics, missing_values, kneighbor=1):
     """Utility function for testing imputation for a given strategy.
 
     Test:
@@ -345,3 +346,66 @@ def test_imputation_copy():
 
     # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is
     # made, even if copy=False.
+
+def test_imputation_knn():
+    # Test imputation using knn strategy.
+    X = np.array([
+        [np.nan, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
+    X2 = np.array([
+        [np.nan, -1,  0,  np.nan],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
+    X3 = np.array([
+        [np.nan, -1,  0,  5],
+        [0,  np.nan, -1,  3],
+        [-1,  -1,  np.nan, 5],
+        [-1,  2,  3,  np.nan],
+    ])
+
+    X_true_1 = np.array([
+        [-1, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
+    X_true_2 = np.array([
+        [-0.5, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1)
+    X_impute = imputer.fit(X).transform(X)
+    assert_array_equal(X_true_1, X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, kneighbor=1)
+    X_impute = imputer.fit(X.transpose()).transform(X.transpose())
+    assert_array_equal(X_true_1.transpose(), X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=2)
+    X_impute = imputer.fit(X).transform(X)
+    assert_array_equal(X_true_2, X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1)
+    X_impute = imputer.fit(X2).transform(X2)
+    assert_array_equal(X_true_1, X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0)
+    msg = "There is no row with complete data!"
+    assert_raise_message(ValueError, msg, imputer.fit, X3)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=4)
+    msg = "There are at most 3 neighbors!"
+    assert_raise_message(ValueError, msg, imputer.fit, X)
+

From de2182d4750ad800d0a529c2aaeccee8cf76da93 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Thu, 11 Jun 2015 13:40:20 -0400
Subject: [PATCH 02/17] add examples for knn imputation, fix error messages,
 add possible euclidean calculation method and add some tests

---
 examples/missing_values.py                    | 45 +++++++++++------
 sklearn/preprocessing/imputation.py           | 49 ++++++++++++-------
 .../preprocessing/tests/test_imputation.py    | 34 ++++++++++---
 3 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/examples/missing_values.py b/examples/missing_values.py
index 59444b36490e3..8952590e7dcfb 100644
--- a/examples/missing_values.py
+++ b/examples/missing_values.py
@@ -8,23 +8,24 @@
 Imputing does not always improve the predictions, so please check via cross-validation.
 Sometimes dropping rows or using marker values is more effective.
 
-Missing values can be replaced by the mean, the median or the most frequent
-value using the ``strategy`` hyper-parameter.
+Missing values can be replaced by the mean, the median, the most frequent
+value or the mean of values of k-nearest neighbors using the ``strategy`` hyper-parameter.
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
 Script output::
 
-  Score with the entire dataset = 0.56
-  Score without the samples containing missing values = 0.48
-  Score after imputation of the missing values = 0.55
+  Score with the entire dataset = 0.43
+  Score without the samples containing missing values = 0.36
+  Score after mean imputation of the missing values = 0.42
+  Score after knn imputation with 10 neighbors of the missing values = 0.43
 
 In this case, imputing helps the classifier get close to the original score.
-  
+
 """
 import numpy as np
 
-from sklearn.datasets import load_boston
+from sklearn.datasets import load_diabetes
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import Imputer
@@ -32,7 +33,7 @@
 
 rng = np.random.RandomState(0)
 
-dataset = load_boston()
+dataset = load_diabetes()
 X_full, y_full = dataset.data, dataset.target
 n_samples = X_full.shape[0]
 n_features = X_full.shape[1]
@@ -42,15 +43,18 @@
 score = cross_val_score(estimator, X_full, y_full).mean()
 print("Score with the entire dataset = %.2f" % score)
 
-# Add missing values in 75% of the lines
-missing_rate = 0.75
+# Add missing values in 60% of the lines
+missing_rate = 0.60 # 60% of samples have missing value
+missing2_rate = 0.80 # 80% of samples with missing value have 2 missing features
 n_missing_samples = np.floor(n_samples * missing_rate)
+n_missing2_samples = np.floor(n_samples * missing_rate * missing2_rate)
 missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                       dtype=np.bool),
                              np.ones(n_missing_samples,
                                      dtype=np.bool)))
 rng.shuffle(missing_samples)
 missing_features = rng.randint(0, n_features, n_missing_samples)
+missing2_features = rng.randint(0, n_features, n_missing2_samples)
 
 # Estimate the score without the lines containing missing values
 X_filtered = X_full[~missing_samples, :]
@@ -59,14 +63,25 @@
 score = cross_val_score(estimator, X_filtered, y_filtered).mean()
 print("Score without the samples containing missing values = %.2f" % score)
 
-# Estimate the score after imputation of the missing values
+# Estimate the score after mean imputation of the missing values
+missing_index = np.where(missing_samples)[0]
+missing2_index = np.random.choice(missing_index, n_missing2_samples)
 X_missing = X_full.copy()
-X_missing[np.where(missing_samples)[0], missing_features] = 0
+X_missing[np.where(missing_samples)[0], missing_features] = np.nan
+X_missing[missing2_index, missing2_features] = np.nan
 y_missing = y_full.copy()
-estimator = Pipeline([("imputer", Imputer(missing_values=0,
-                                          strategy="mean",
+estimator = Pipeline([("imputer", Imputer(strategy="mean",
                                           axis=0)),
                       ("forest", RandomForestRegressor(random_state=0,
                                                        n_estimators=100))])
 score = cross_val_score(estimator, X_missing, y_missing).mean()
-print("Score after imputation of the missing values = %.2f" % score)
+print("Score after mean imputation of the missing values = %.2f" % score)
+
+# Estimate the score after knn imputation of the missing values
+neigh = 7
+estimator2 = Pipeline([("imputer", Imputer(strategy="knn",
+                                           axis=0, n_neighbors=neigh)),
+                      ("forest", RandomForestRegressor(random_state=0,
+                                                       n_estimators=100))])
+score = cross_val_score(estimator2, X_missing, y_missing).mean()
+print("Score after knn imputation with %d neighbors of the missing values = %.2f" % (neigh, score))
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index cfcaef271c05c..90159dfb11215 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -106,7 +106,7 @@ class Imputer(BaseEstimator, TransformerMixin):
         - If `axis=0` and X is encoded as a CSR matrix;
         - If `axis=1` and X is encoded as a CSC matrix.
 
-    kneighbor : int, optional (default=1)
+    n_neighbors : int, optional (default=1)
         It only has effect if the strategy is "knn". It controls the number of nearest
         neighbors used to compute the mean along the axis.
 
@@ -124,13 +124,13 @@ class Imputer(BaseEstimator, TransformerMixin):
       contain missing values).
     """
     def __init__(self, missing_values="NaN", strategy="mean",
-                 axis=0, verbose=0, copy=True, kneighbor=1):
+                 axis=0, verbose=0, copy=True, n_neighbors=1):
         self.missing_values = missing_values
         self.strategy = strategy
         self.axis = axis
         self.verbose = verbose
         self.copy = copy
-        self.kneighbor = kneighbor
+        self.n_neighbors = n_neighbors
 
     def fit(self, X, y=None):
         """Fit the imputer on X.
@@ -258,7 +258,7 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
                 return most_frequent
 
             elif strategy == "knn":
-                raise ValueError("Sparse matrix not supported!")
+                raise ValueError("strategy='knn' does not support sparse matrix input")
 
 
     def _dense_fit(self, X, strategy, missing_values, axis):
@@ -320,9 +320,9 @@ def _dense_fit(self, X, strategy, missing_values, axis):
 
             full_data = X[np.logical_not(mask.any(1))]
             if full_data.size == 0:
-                raise ValueError("There is no row with complete data!")
-            if full_data.shape[0] < self.kneighbor:
-                raise ValueError("There are at most %d neighbors!" %(full_data.shape[0]))
+                raise ValueError("There is no sample with complete data.")
+            if full_data.shape[0] < self.n_neighbors:
+                raise ValueError("There are only %d complete samples, but n_neighbors=%d." %(full_data.shape[0], self.n_neighbors))
             if axis == 1:
                 full_data = full_data.transpose()
 
@@ -404,18 +404,29 @@ def transform(self, X):
                     mask = mask.transpose()
                     statistics = statistics.transpose()
                 missing_index = np.where(mask.any(1))[0]
-                for i, row in zip(missing_index, X[missing_index]):
-                    col_index = np.where(np.logical_not(np.isnan(row)))[0]
-                    impute_index = np.where(np.isnan(row))[0]
-                    neigh = NearestNeighbors(self.kneighbor)
-                    neigh = neigh.fit(statistics[:, col_index])
-                    _dist, ind = neigh.kneighbors(row[np.logical_not(np.isnan(row))],
-                                           self.kneighbor)
-                    #tree = KDTree(statistics[:, col_index])
-                    #dist, ind = tree.query(row[np.logical_not(np.isnan(row))],
-                    #                       k=self.kneighbor)
-                    nn_index = ind[0]
-                    X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0)
+                if True:
+                    for i, row in zip(missing_index, X[missing_index]):
+                        col_na_mask = np.isnan(row)
+                        col_full_mask = np.logical_not(col_na_mask)
+                        col_index = np.where(col_full_mask)[0]
+                        impute_index = np.where(col_na_mask)[0]
+                        neigh = NearestNeighbors(self.n_neighbors)
+                        neigh = neigh.fit(statistics[:, col_index])
+                        _dist, ind = neigh.kneighbors(row[col_full_mask],
+                                                       self.n_neighbors)
+                        nn_index = ind[0]
+                        X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0)
+                else:
+
+                    #@jnothman 's method
+
+                    D2 = (X[missing_index, np.newaxis] - statistics) ** 2
+                    D2[np.isnan(D2)] = 0
+                    missing_row, missing_col = np.where(np.isnan(X))
+                    sqdist = D2.sum(axis=2)
+                    ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
+                    means = np.mean(statistics[ind], axis=1)
+                    X[missing_row, missing_col] = means[np.where(np.isnan(X[missing_index]))[0], missing_col]
 
                 if self.axis == 1:
                     X = X.transpose()
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
index fbd228367bbab..e79c4e1375bb7 100644
--- a/sklearn/preprocessing/tests/test_imputation.py
+++ b/sklearn/preprocessing/tests/test_imputation.py
@@ -16,7 +16,7 @@
 
 
 def _check_statistics(X, X_true,
-                      strategy, statistics, missing_values, kneighbor=1):
+                      strategy, statistics, missing_values):
     """Utility function for testing imputation for a given strategy.
 
     Test:
@@ -370,6 +370,13 @@ def test_imputation_knn():
         [-1,  2,  3,  np.nan],
     ])
 
+    X4 = np.array([
+        [np.nan, -1,  0,  5],
+        [np.nan,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [0,  2,  -1,  6],
+    ])
+
     X_true_1 = np.array([
         [-1, -1,  0,  5],
         [0,  2, -1,  3],
@@ -384,28 +391,39 @@ def test_imputation_knn():
         [-1,  2,  3,  7],
     ])
 
+    X_true_4 = np.array([
+        [-1, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [0,  2, -1,  6],
+    ])
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1)
     X_impute = imputer.fit(X).transform(X)
     assert_array_equal(X_true_1, X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, kneighbor=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, n_neighbors=1)
     X_impute = imputer.fit(X.transpose()).transform(X.transpose())
     assert_array_equal(X_true_1.transpose(), X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=2)
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=2)
     X_impute = imputer.fit(X).transform(X)
     assert_array_equal(X_true_2, X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1)
     X_impute = imputer.fit(X2).transform(X2)
     assert_array_equal(X_true_1, X_impute)
 
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1)
+    X_impute = imputer.fit(X4).transform(X4)
+    assert_array_equal(X_true_4, X_impute)
+
     imputer = Imputer(missing_values='NaN', strategy="knn", axis=0)
-    msg = "There is no row with complete data!"
+    msg = "There is no sample with complete data."
     assert_raise_message(ValueError, msg, imputer.fit, X3)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=4)
-    msg = "There are at most 3 neighbors!"
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=4)
+    msg = "There are only 3 complete samples, but n_neighbors=4."
     assert_raise_message(ValueError, msg, imputer.fit, X)
 
+

From 160809785977354f941a63cf7db72d6dfd5f35ae Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Thu, 11 Jun 2015 14:46:31 -0400
Subject: [PATCH 03/17] change to block query

---
 sklearn/preprocessing/imputation.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 90159dfb11215..1cbaade0d3e3b 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -15,7 +15,7 @@
 from ..utils.fixes import astype
 from ..utils.sparsefuncs import _get_median
 from ..utils.validation import check_is_fitted
-
+from ..utils import gen_batches
 from ..externals import six
 
 zip = six.moves.zip
@@ -404,7 +404,7 @@ def transform(self, X):
                     mask = mask.transpose()
                     statistics = statistics.transpose()
                 missing_index = np.where(mask.any(1))[0]
-                if True:
+                if False:
                     for i, row in zip(missing_index, X[missing_index]):
                         col_na_mask = np.isnan(row)
                         col_full_mask = np.logical_not(col_na_mask)
@@ -419,14 +419,19 @@ def transform(self, X):
                 else:
 
                     #@jnothman 's method
-
-                    D2 = (X[missing_index, np.newaxis] - statistics) ** 2
-                    D2[np.isnan(D2)] = 0
-                    missing_row, missing_col = np.where(np.isnan(X))
-                    sqdist = D2.sum(axis=2)
-                    ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
-                    means = np.mean(statistics[ind], axis=1)
-                    X[missing_row, missing_col] = means[np.where(np.isnan(X[missing_index]))[0], missing_col]
+                    for sl in list(gen_batches(len(missing_index),100)):
+                        index_start, index_stop = missing_index[sl][0],missing_index[sl][-1]+1
+                        X_sl = X[index_start: index_stop].copy()
+                        mask_sl = _get_mask(X_sl, self.missing_values)
+                        missing_index_sl = np.where(mask_sl.any(1))[0]
+                        D2 = (X_sl[missing_index_sl, np.newaxis] - statistics) ** 2
+                        D2[np.isnan(D2)] = 0
+                        missing_row, missing_col = np.where(np.isnan(X_sl))
+                        sqdist = D2.sum(axis=2)
+                        ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
+                        means = np.mean(statistics[ind], axis=1)
+                        X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], missing_col]
+                        X[index_start: index_stop] = X_sl
 
                 if self.axis == 1:
                     X = X.transpose()

From 4ff1dd7e20f9f8c699706bc542999e7672cf349b Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Mon, 15 Jun 2015 16:11:27 -0400
Subject: [PATCH 04/17] fix doc, fix numpy compatibility; TODO: groupby missing
 features, speed comparison, examples

---
 doc/modules/preprocessing.rst                 |  8 +++----
 sklearn/preprocessing/imputation.py           | 21 ++++++++++---------
 .../preprocessing/tests/test_imputation.py    |  2 --
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index a25fd9fb49b3b..cc0cf59dd5f2b 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -399,8 +399,8 @@ that contain the missing values::
     >>> import numpy as np
     >>> from sklearn.preprocessing import Imputer
     >>> imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
-    >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])
-    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
+    >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE
+    Imputer(axis=0, copy=True, missing_values='NaN', n_neighbors=1, strategy='mean', verbose=0)
     >>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
     >>> print(imp.transform(X))                           # doctest: +ELLIPSIS
     [[ 4.          2.        ]
@@ -412,8 +412,8 @@ The :class:`Imputer` class also supports sparse matrices::
     >>> import scipy.sparse as sp
     >>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
     >>> imp = Imputer(missing_values=0, strategy='mean', axis=0)
-    >>> imp.fit(X)
-    Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)
+    >>> imp.fit(X) # doctest: +NORMALIZE_WHITESPACE
+    Imputer(axis=0, copy=True, missing_values=0, n_neighbors=1, strategy='mean', verbose=0)
     >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
     >>> print(imp.transform(X_test))                      # doctest: +ELLIPSIS
     [[ 4.          2.        ]
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 1cbaade0d3e3b..7f8629a45c551 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -8,7 +8,7 @@
 from scipy import sparse
 from scipy import stats
 
-from ..neighbors import KDTree, NearestNeighbors
+from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
 from ..utils import as_float_array
@@ -322,7 +322,8 @@ def _dense_fit(self, X, strategy, missing_values, axis):
             if full_data.size == 0:
                 raise ValueError("There is no sample with complete data.")
             if full_data.shape[0] < self.n_neighbors:
-                raise ValueError("There are only %d complete samples, but n_neighbors=%d." %(full_data.shape[0], self.n_neighbors))
+                raise ValueError("There are only %d complete samples, but n_neighbors=%d."
+                                 % (full_data.shape[0], self.n_neighbors))
             if axis == 1:
                 full_data = full_data.transpose()
 
@@ -413,25 +414,25 @@ def transform(self, X):
                         neigh = NearestNeighbors(self.n_neighbors)
                         neigh = neigh.fit(statistics[:, col_index])
                         _dist, ind = neigh.kneighbors(row[col_full_mask],
-                                                       self.n_neighbors)
+                                                      self.n_neighbors)
                         nn_index = ind[0]
                         X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0)
-                else:
+                elif True:
 
                     #@jnothman 's method
-                    for sl in list(gen_batches(len(missing_index),100)):
-                        index_start, index_stop = missing_index[sl][0],missing_index[sl][-1]+1
-                        X_sl = X[index_start: index_stop].copy()
+                    for sl in list(gen_batches(len(missing_index), 100)):
+                        index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1
+                        X_sl = X[index_start: index_stop]
                         mask_sl = _get_mask(X_sl, self.missing_values)
                         missing_index_sl = np.where(mask_sl.any(1))[0]
-                        D2 = (X_sl[missing_index_sl, np.newaxis] - statistics) ** 2
+                        D2 = (X_sl[missing_index_sl, np.newaxis, :] - statistics) ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)
                         ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
                         means = np.mean(statistics[ind], axis=1)
-                        X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], missing_col]
-                        X[index_start: index_stop] = X_sl
+                        X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0],
+                                                               missing_col]
 
                 if self.axis == 1:
                     X = X.transpose()
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
index e79c4e1375bb7..e66d12e16b58d 100644
--- a/sklearn/preprocessing/tests/test_imputation.py
+++ b/sklearn/preprocessing/tests/test_imputation.py
@@ -425,5 +425,3 @@ def test_imputation_knn():
     imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=4)
     msg = "There are only 3 complete samples, but n_neighbors=4."
     assert_raise_message(ValueError, msg, imputer.fit, X)
-
-

From 0b2cdf7e2d5bbb15547b37070f55c8914f730a32 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Mon, 15 Jun 2015 16:37:50 -0400
Subject: [PATCH 05/17] fix numpy compatibility again

---
 sklearn/preprocessing/imputation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 7f8629a45c551..2dd6df3cab39b 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -425,7 +425,7 @@ def transform(self, X):
                         X_sl = X[index_start: index_stop]
                         mask_sl = _get_mask(X_sl, self.missing_values)
                         missing_index_sl = np.where(mask_sl.any(1))[0]
-                        D2 = (X_sl[missing_index_sl, np.newaxis, :] - statistics) ** 2
+                        D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)

From 1b731cf0b4ba5f806f127c8a27c175c05045a720 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Tue, 16 Jun 2015 11:09:10 -0400
Subject: [PATCH 06/17] groupby missing feature, fix circular import

---
 sklearn/preprocessing/imputation.py | 41 ++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 2dd6df3cab39b..43700907907bc 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -2,13 +2,12 @@
 # License: BSD 3 clause
 
 import warnings
-
+import itertools
 import numpy as np
 import numpy.ma as ma
 from scipy import sparse
 from scipy import stats
 
-from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
 from ..utils import as_float_array
@@ -404,21 +403,8 @@ def transform(self, X):
                     X = X.transpose()
                     mask = mask.transpose()
                     statistics = statistics.transpose()
-                missing_index = np.where(mask.any(1))[0]
                 if False:
-                    for i, row in zip(missing_index, X[missing_index]):
-                        col_na_mask = np.isnan(row)
-                        col_full_mask = np.logical_not(col_na_mask)
-                        col_index = np.where(col_full_mask)[0]
-                        impute_index = np.where(col_na_mask)[0]
-                        neigh = NearestNeighbors(self.n_neighbors)
-                        neigh = neigh.fit(statistics[:, col_index])
-                        _dist, ind = neigh.kneighbors(row[col_full_mask],
-                                                      self.n_neighbors)
-                        nn_index = ind[0]
-                        X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0)
-                elif True:
-
+                    missing_index = np.where(mask.any(1))[0]
                     #@jnothman 's method
                     for sl in list(gen_batches(len(missing_index), 100)):
                         index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1
@@ -433,6 +419,29 @@ def transform(self, X):
                         means = np.mean(statistics[ind], axis=1)
                         X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0],
                                                                missing_col]
+                else:
+                    # group by missing features and batch within group
+                    group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), return_inverse=True)[1]
+                    for group_number in range(max(group_index)+1):
+                        if group_number == 0:
+                            continue
+                        else:
+                            missing_index = np.where(group_index == group_number)[0]
+                            batch_slice = list(gen_batches(len(missing_index), 100))
+                            for sl in batch_slice:
+                                index_sl = missing_index[sl]
+                                X_sl = X[index_sl]
+                                D2 = (X_sl[:][:, np.newaxis, :] - statistics) ** 2
+                                D2[np.isnan(D2)] = 0
+                                missing_row, missing_col = np.where(np.isnan(X_sl))
+                                sqdist = D2.sum(axis=2)
+                                ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
+                                means = np.mean(statistics[ind], axis=1)
+                                X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
+                                                                       missing_col]
+                                X[index_sl] = X_sl
+
+
 
                 if self.axis == 1:
                     X = X.transpose()

From deb8c80986d4c3c417b04fd9996d09f3e43d78d1 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Tue, 16 Jun 2015 14:31:34 -0400
Subject: [PATCH 07/17] choose batchsize

---
 sklearn/preprocessing/imputation.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 43700907907bc..c90d1fa7f5cd4 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -2,7 +2,6 @@
 # License: BSD 3 clause
 
 import warnings
-import itertools
 import numpy as np
 import numpy.ma as ma
 from scipy import sparse
@@ -403,15 +402,22 @@ def transform(self, X):
                     X = X.transpose()
                     mask = mask.transpose()
                     statistics = statistics.transpose()
+
+                batch_size = 10  # set batch size for block query
                 if False:
                     missing_index = np.where(mask.any(1))[0]
                     #@jnothman 's method
-                    for sl in list(gen_batches(len(missing_index), 100)):
+                    for sl in list(gen_batches(len(missing_index), batch_size)):
                         index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1
                         X_sl = X[index_start: index_stop]
                         mask_sl = _get_mask(X_sl, self.missing_values)
                         missing_index_sl = np.where(mask_sl.any(1))[0]
-                        D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
+                        t1 = time()
+                        fancy_index = X_sl[missing_index_sl][:, np.newaxis, :]
+                        D2 = np.square(fancy_index - statistics)
+                        #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
+                        t2 = time()
+                        time_1 = time_1 + (t2-t1)
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)
@@ -427,7 +433,7 @@ def transform(self, X):
                             continue
                         else:
                             missing_index = np.where(group_index == group_number)[0]
-                            batch_slice = list(gen_batches(len(missing_index), 100))
+                            batch_slice = list(gen_batches(len(missing_index), batch_size))
                             for sl in batch_slice:
                                 index_sl = missing_index[sl]
                                 X_sl = X[index_sl]
@@ -456,4 +462,5 @@ def transform(self, X):
 
                 X[coordinates] = values
 
-        return X
\ No newline at end of file
+        return X
+

From 430a2a0d93006d0661b350ca01821be051fd6c55 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Tue, 16 Jun 2015 14:36:13 -0400
Subject: [PATCH 08/17] delete test code

---
 sklearn/preprocessing/imputation.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index c90d1fa7f5cd4..5377f9bdc198b 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -412,12 +412,7 @@ def transform(self, X):
                         X_sl = X[index_start: index_stop]
                         mask_sl = _get_mask(X_sl, self.missing_values)
                         missing_index_sl = np.where(mask_sl.any(1))[0]
-                        t1 = time()
-                        fancy_index = X_sl[missing_index_sl][:, np.newaxis, :]
-                        D2 = np.square(fancy_index - statistics)
-                        #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
-                        t2 = time()
-                        time_1 = time_1 + (t2-t1)
+                        D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)

From 85039ac769fd39bd351cd1e7344dcf3282a7188c Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Tue, 16 Jun 2015 17:32:08 -0400
Subject: [PATCH 09/17] change example/missing_value; using diabetes dataset,
 and use random matrix to create na. add comparison between knn and mean
 imputation

---
 examples/missing_values.py | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/examples/missing_values.py b/examples/missing_values.py
index 8952590e7dcfb..651b51b3f4399 100644
--- a/examples/missing_values.py
+++ b/examples/missing_values.py
@@ -18,7 +18,7 @@
   Score with the entire dataset = 0.43
   Score without the samples containing missing values = 0.36
   Score after mean imputation of the missing values = 0.42
-  Score after knn imputation with 10 neighbors of the missing values = 0.43
+  Score after knn imputation with 7 neighbors of the missing values = 0.43
 
 In this case, imputing helps the classifier get close to the original score.
 
@@ -38,23 +38,19 @@
 n_samples = X_full.shape[0]
 n_features = X_full.shape[1]
 
+#Create a random matrix to randomly make missing values
+missing_matrix = np.random.rand(n_samples, n_features)
+th = 0.15  # each sample has (1-th)^n_features of probability to have full features
+mask = missing_matrix < th
+missing_samples = mask.any(1)
+full_percentage = (n_samples - missing_samples.sum())/float(n_samples)
+print("Percentage of samples with full features: %f" %full_percentage )
+
 # Estimate the score on the entire dataset, with no missing values
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
 score = cross_val_score(estimator, X_full, y_full).mean()
 print("Score with the entire dataset = %.2f" % score)
 
-# Add missing values in 60% of the lines
-missing_rate = 0.60 # 60% of samples have missing value
-missing2_rate = 0.80 # 80% of samples with missing value have 2 missing features
-n_missing_samples = np.floor(n_samples * missing_rate)
-n_missing2_samples = np.floor(n_samples * missing_rate * missing2_rate)
-missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                      dtype=np.bool),
-                             np.ones(n_missing_samples,
-                                     dtype=np.bool)))
-rng.shuffle(missing_samples)
-missing_features = rng.randint(0, n_features, n_missing_samples)
-missing2_features = rng.randint(0, n_features, n_missing2_samples)
 
 # Estimate the score without the lines containing missing values
 X_filtered = X_full[~missing_samples, :]
@@ -64,12 +60,11 @@
 print("Score without the samples containing missing values = %.2f" % score)
 
 # Estimate the score after mean imputation of the missing values
-missing_index = np.where(missing_samples)[0]
-missing2_index = np.random.choice(missing_index, n_missing2_samples)
+
 X_missing = X_full.copy()
-X_missing[np.where(missing_samples)[0], missing_features] = np.nan
-X_missing[missing2_index, missing2_features] = np.nan
+X_missing[mask] = np.nan
 y_missing = y_full.copy()
+
 estimator = Pipeline([("imputer", Imputer(strategy="mean",
                                           axis=0)),
                       ("forest", RandomForestRegressor(random_state=0,

From 998fba610b162b7320f03241d8aa003dc59f20ef Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Wed, 17 Jun 2015 12:27:54 -0400
Subject: [PATCH 10/17] avoid _get_mask for each iteration; add axis kw for
 np.any

---
 sklearn/preprocessing/imputation.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 5377f9bdc198b..8e4b575d4bb76 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -105,13 +105,14 @@ class Imputer(BaseEstimator, TransformerMixin):
         - If `axis=1` and X is encoded as a CSC matrix.
 
     n_neighbors : int, optional (default=1)
-        It only has effect if the strategy is "knn". It controls the number of nearest
+        It only has effect if the `strategy=knn`. It controls the number of nearest
         neighbors used to compute the mean along the axis.
 
     Attributes
     ----------
     statistics_ : array of shape (n_features,)
         The imputation fill value for each feature if axis == 0.
+        If `strategy=knn`, then it contains those samples having no missing value.
 
     Notes
     -----
@@ -120,6 +121,7 @@ class Imputer(BaseEstimator, TransformerMixin):
     - When ``axis=1``, an exception is raised if there are rows for which it is
       not possible to fill in the missing values (e.g., because they only
       contain missing values).
+    - Knn strategy currently doesn't support sparse matrix.
     """
     def __init__(self, missing_values="NaN", strategy="mean",
                  axis=0, verbose=0, copy=True, n_neighbors=1):
@@ -316,7 +318,7 @@ def _dense_fit(self, X, strategy, missing_values, axis):
             if axis == 1:
                 X = X.copy().transpose()
 
-            full_data = X[np.logical_not(mask.any(1))]
+            full_data = X[np.logical_not(mask.any(axis=1))]
             if full_data.size == 0:
                 raise ValueError("There is no sample with complete data.")
             if full_data.shape[0] < self.n_neighbors:
@@ -405,21 +407,21 @@ def transform(self, X):
 
                 batch_size = 10  # set batch size for block query
                 if False:
-                    missing_index = np.where(mask.any(1))[0]
+                    missing_index = np.where(mask.any(axis=1))[0]
                     #@jnothman 's method
                     for sl in list(gen_batches(len(missing_index), batch_size)):
-                        index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1
-                        X_sl = X[index_start: index_stop]
-                        mask_sl = _get_mask(X_sl, self.missing_values)
-                        missing_index_sl = np.where(mask_sl.any(1))[0]
-                        D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
+                        X_sl = X[missing_index[sl]]
+                        test1 = X_sl[:][:, np.newaxis, :] - statistics
+                        D2 = test1 ** 2
+                        #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)
                         ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
                         means = np.mean(statistics[ind], axis=1)
-                        X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0],
+                        X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
                                                                missing_col]
+                        X[missing_index[sl]] = X_sl
                 else:
                     # group by missing features and batch within group
                     group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), return_inverse=True)[1]

From a64ee12816605878d2aa76af2a62a64d29e9e3c6 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Wed, 19 Aug 2015 15:02:57 -0400
Subject: [PATCH 11/17] preallocate output array

---
 sklearn/preprocessing/imputation.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 8e4b575d4bb76..96b2f452b1074 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -405,14 +405,19 @@ def transform(self, X):
                     mask = mask.transpose()
                     statistics = statistics.transpose()
 
-                batch_size = 10  # set batch size for block query
-                if False:
+                batch_size = 20  # set batch size for block query
+                if True:
                     missing_index = np.where(mask.any(axis=1))[0]
                     #@jnothman 's method
+                    D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], statistics.shape[1]]))
                     for sl in list(gen_batches(len(missing_index), batch_size)):
                         X_sl = X[missing_index[sl]]
                         test1 = X_sl[:][:, np.newaxis, :] - statistics
-                        D2 = test1 ** 2
+                        #D2 = np.empty_like(test1)
+                        if test1.shape != D2.shape:
+                            D2 = np.empty_like(test1)
+                        np.multiply(test1, test1, out=D2)
+                        #D2 = test1 ** 2
                         #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
@@ -434,7 +439,10 @@ def transform(self, X):
                             for sl in batch_slice:
                                 index_sl = missing_index[sl]
                                 X_sl = X[index_sl]
-                                D2 = (X_sl[:][:, np.newaxis, :] - statistics) ** 2
+                                test1 = X_sl[:][:, np.newaxis, :] - statistics
+                                D2 = np.empty_like(test1)
+                                np.multiply(test1, test1, out=D2)
+                                #D2 = test1 ** 2
                                 D2[np.isnan(D2)] = 0
                                 missing_row, missing_col = np.where(np.isnan(X_sl))
                                 sqdist = D2.sum(axis=2)

From eacf3e8d6156199be68b6e2ae14cdd4776ebff1e Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Fri, 21 Aug 2015 13:50:22 -0400
Subject: [PATCH 12/17] add documentation

---
 doc/modules/preprocessing.rst       |  7 +++---
 examples/missing_values.py          | 18 +++++++++-----
 sklearn/preprocessing/imputation.py | 37 +++++++++++++++++++----------
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index cc0cf59dd5f2b..757e0d03d9148 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -388,9 +388,10 @@ values. However, this comes at the price of losing data which may be valuable
 i.e., to infer them from the known part of the data.
 
 The :class:`Imputer` class provides basic strategies for imputing missing
-values, either using the mean, the median or the most frequent value of
-the row or column in which the missing values are located. This class
-also allows for different missing values encodings.
+values, either using the mean, the median, the most frequent value of
+the row or column in which the missing values are located or the mean of the
+k-nearest neighbors computed using samples without missing values. This class also
+allows for different missing values encodings.
 
 The following snippet demonstrates how to replace missing values,
 encoded as ``np.nan``, using the mean value of the columns (axis 0)
diff --git a/examples/missing_values.py b/examples/missing_values.py
index 651b51b3f4399..57411b5d2de5a 100644
--- a/examples/missing_values.py
+++ b/examples/missing_values.py
@@ -16,7 +16,7 @@
 Script output::
 
   Score with the entire dataset = 0.43
-  Score without the samples containing missing values = 0.36
+  Score without the samples containing missing values = 0.35
   Score after mean imputation of the missing values = 0.42
   Score after knn imputation with 7 neighbors of the missing values = 0.43
 
@@ -39,20 +39,24 @@
 n_features = X_full.shape[1]
 
 #Create a random matrix to randomly make missing values
-missing_matrix = np.random.rand(n_samples, n_features)
-th = 0.15  # each sample has (1-th)^n_features of probability to have full features
+missing_matrix = rng.rand(n_samples, n_features)
+
+# each sample has (1-th)^n_features of probability to have full features
+th = 0.14
 mask = missing_matrix < th
-missing_samples = mask.any(1)
+missing_samples = mask.any(axis=1)
 full_percentage = (n_samples - missing_samples.sum())/float(n_samples)
 print("Percentage of samples with full features: %f" %full_percentage )
 
 # Estimate the score on the entire dataset, with no missing values
+
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
 score = cross_val_score(estimator, X_full, y_full).mean()
 print("Score with the entire dataset = %.2f" % score)
 
 
 # Estimate the score without the lines containing missing values
+
 X_filtered = X_full[~missing_samples, :]
 y_filtered = y_full[~missing_samples]
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
@@ -73,10 +77,12 @@
 print("Score after mean imputation of the missing values = %.2f" % score)
 
 # Estimate the score after knn imputation of the missing values
-neigh = 7
+
+neigh = 7  # Number of neighbors to be used
 estimator2 = Pipeline([("imputer", Imputer(strategy="knn",
                                            axis=0, n_neighbors=neigh)),
                       ("forest", RandomForestRegressor(random_state=0,
                                                        n_estimators=100))])
 score = cross_val_score(estimator2, X_missing, y_missing).mean()
-print("Score after knn imputation with %d neighbors of the missing values = %.2f" % (neigh, score))
+print("Score after knn imputation with %d neighbors of the missing values ="
+      " %.2f" % (neigh, score))
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 96b2f452b1074..bc07071c48501 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -82,8 +82,8 @@ class Imputer(BaseEstimator, TransformerMixin):
           the axis.
         - If "most_frequent", then replace missing using the most frequent
           value along the axis.
-        - If "knn", then replace missing using the mean of the k-nearest neighbors
-          along the axis.
+        - If "knn", then replace missing using the mean of the k-nearest
+          neighbors along the axis.
 
     axis : integer, optional (default=0)
         The axis along which to impute.
@@ -105,8 +105,8 @@ class Imputer(BaseEstimator, TransformerMixin):
         - If `axis=1` and X is encoded as a CSC matrix.
 
     n_neighbors : int, optional (default=1)
-        It only has effect if the `strategy=knn`. It controls the number of nearest
-        neighbors used to compute the mean along the axis.
+        It only has effect if the `strategy=knn`. It controls the number of
+        nearest neighbors used to compute the mean along the axis.
 
     Attributes
     ----------
@@ -257,8 +257,10 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
 
                 return most_frequent
 
+            # KNN
             elif strategy == "knn":
-                raise ValueError("strategy='knn' does not support sparse matrix input")
+                raise ValueError("strategy='knn' does not support sparse "
+                                 "matrix input")
 
 
     def _dense_fit(self, X, strategy, missing_values, axis):
@@ -316,14 +318,18 @@ def _dense_fit(self, X, strategy, missing_values, axis):
         elif strategy == "knn":
 
             if axis == 1:
-                X = X.copy().transpose()
+                X = X.transpose()
+                mask = mask.transpose()
 
+            # Get samples with complete features
             full_data = X[np.logical_not(mask.any(axis=1))]
             if full_data.size == 0:
                 raise ValueError("There is no sample with complete data.")
             if full_data.shape[0] < self.n_neighbors:
-                raise ValueError("There are only %d complete samples, but n_neighbors=%d."
+                raise ValueError("There are only %d complete samples, "
+                                 "but n_neighbors=%d."
                                  % (full_data.shape[0], self.n_neighbors))
+            # Transpose back
             if axis == 1:
                 full_data = full_data.transpose()
 
@@ -405,20 +411,26 @@ def transform(self, X):
                     mask = mask.transpose()
                     statistics = statistics.transpose()
 
-                batch_size = 20  # set batch size for block query
+                batch_size = 200  # set batch size for block query
                 if True:
                     missing_index = np.where(mask.any(axis=1))[0]
-                    #@jnothman 's method
-                    D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], statistics.shape[1]]))
+                    # @jnothman 's method
+                    D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0],
+                                                 statistics.shape[1]]))
+                    # Preallocate output array for np.multiply(test1, test1, out=D2)
                     for sl in list(gen_batches(len(missing_index), batch_size)):
                         X_sl = X[missing_index[sl]]
                         test1 = X_sl[:][:, np.newaxis, :] - statistics
                         #D2 = np.empty_like(test1)
+
+                        # For the last slice, the length may not be the same
+                        # as batch_size
                         if test1.shape != D2.shape:
                             D2 = np.empty_like(test1)
                         np.multiply(test1, test1, out=D2)
                         #D2 = test1 ** 2
-                        #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2
+                        #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics)
+                        #  ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)
@@ -429,7 +441,8 @@ def transform(self, X):
                         X[missing_index[sl]] = X_sl
                 else:
                     # group by missing features and batch within group
-                    group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), return_inverse=True)[1]
+                    group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])),
+                                            return_inverse=True)[1]
                     for group_number in range(max(group_index)+1):
                         if group_number == 0:
                             continue

From c442bbb1b12b42f0dc5373dbd11f79b2dbed426f Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Mon, 24 Aug 2015 11:43:56 -0400
Subject: [PATCH 13/17] fix not nan imputation

---
 sklearn/preprocessing/imputation.py           |  4 +++
 .../preprocessing/tests/test_imputation.py    | 28 +++++++++++++++----
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index bc07071c48501..b7fa2c4473a7f 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -420,6 +420,8 @@ def transform(self, X):
                     # Preallocate output array for np.multiply(test1, test1, out=D2)
                     for sl in list(gen_batches(len(missing_index), batch_size)):
                         X_sl = X[missing_index[sl]]
+                        mask_sl = mask[missing_index[sl]]
+                        X_sl[mask_sl] = np.nan
                         test1 = X_sl[:][:, np.newaxis, :] - statistics
                         #D2 = np.empty_like(test1)
 
@@ -452,6 +454,8 @@ def transform(self, X):
                             for sl in batch_slice:
                                 index_sl = missing_index[sl]
                                 X_sl = X[index_sl]
+                                mask_sl = mask[missing_index[sl]]
+                                X_sl[mask_sl] = np.nan
                                 test1 = X_sl[:][:, np.newaxis, :] - statistics
                                 D2 = np.empty_like(test1)
                                 np.multiply(test1, test1, out=D2)
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
index e66d12e16b58d..e907e20d695bc 100644
--- a/sklearn/preprocessing/tests/test_imputation.py
+++ b/sklearn/preprocessing/tests/test_imputation.py
@@ -377,6 +377,13 @@ def test_imputation_knn():
         [0,  2,  -1,  6],
     ])
 
+    X5 = np.array([
+        [999, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
     X_true_1 = np.array([
         [-1, -1,  0,  5],
         [0,  2, -1,  3],
@@ -398,26 +405,37 @@ def test_imputation_knn():
         [0,  2, -1,  6],
     ])
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=1)
     X_impute = imputer.fit(X).transform(X)
     assert_array_equal(X_true_1, X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, n_neighbors=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=1, n_neighbors=1)
     X_impute = imputer.fit(X.transpose()).transform(X.transpose())
     assert_array_equal(X_true_1.transpose(), X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=2)
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=2)
     X_impute = imputer.fit(X).transform(X)
     assert_array_equal(X_true_2, X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=1)
     X_impute = imputer.fit(X2).transform(X2)
     assert_array_equal(X_true_1, X_impute)
 
-    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1)
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=1)
     X_impute = imputer.fit(X4).transform(X4)
     assert_array_equal(X_true_4, X_impute)
 
+    imputer = Imputer(missing_values=999, strategy="knn",
+                      axis=0, n_neighbors=1, copy=False)
+    X5 = X5.astype(float)
+    X_impute = imputer.fit(X5).transform(X5)
+    assert_array_equal(X_true_1, X5)
+
     imputer = Imputer(missing_values='NaN', strategy="knn", axis=0)
     msg = "There is no sample with complete data."
     assert_raise_message(ValueError, msg, imputer.fit, X3)

From 5a2906cd915da4229b04a469df63540db1c2567c Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Mon, 24 Aug 2015 12:08:06 -0400
Subject: [PATCH 14/17] change variable name; add user guide doc

---
 doc/modules/preprocessing.rst       |  3 +++
 sklearn/preprocessing/imputation.py | 23 +++++++++++------------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 757e0d03d9148..5b8ef636f874b 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -425,5 +425,8 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored
 in the matrix. This format is thus suitable when there are many more missing
 values than observed values.
 
+Also, knn imputation strategy will use samples with full features, and if all samples
+have missing features, this strategy will fail.
+
 :class:`Imputer` can be used in a Pipeline as a way to build a composite
 estimator that supports imputation. See :ref:`example_missing_values.py`
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index b7fa2c4473a7f..6fc7351c847d4 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -422,22 +422,21 @@ def transform(self, X):
                         X_sl = X[missing_index[sl]]
                         mask_sl = mask[missing_index[sl]]
                         X_sl[mask_sl] = np.nan
-                        test1 = X_sl[:][:, np.newaxis, :] - statistics
-                        #D2 = np.empty_like(test1)
+                        impute_dist = X_sl[:][:, np.newaxis, :] - statistics
 
                         # For the last slice, the length may not be the same
                         # as batch_size
-                        if test1.shape != D2.shape:
-                            D2 = np.empty_like(test1)
-                        np.multiply(test1, test1, out=D2)
+                        if impute_dist.shape != D2.shape:
+                            D2 = np.empty_like(impute_dist)
+                        np.multiply(impute_dist, impute_dist, out=D2)
                         #D2 = test1 ** 2
                         #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics)
                         #  ** 2
                         D2[np.isnan(D2)] = 0
                         missing_row, missing_col = np.where(np.isnan(X_sl))
                         sqdist = D2.sum(axis=2)
-                        ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
-                        means = np.mean(statistics[ind], axis=1)
+                        target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
+                        means = np.mean(statistics[target_index], axis=1)
                         X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
                                                                missing_col]
                         X[missing_index[sl]] = X_sl
@@ -456,15 +455,15 @@ def transform(self, X):
                                 X_sl = X[index_sl]
                                 mask_sl = mask[missing_index[sl]]
                                 X_sl[mask_sl] = np.nan
-                                test1 = X_sl[:][:, np.newaxis, :] - statistics
-                                D2 = np.empty_like(test1)
-                                np.multiply(test1, test1, out=D2)
+                                impute_dist = X_sl[:][:, np.newaxis, :] - statistics
+                                D2 = np.empty_like(impute_dist)
+                                np.multiply(impute_dist, impute_dist, out=D2)
                                 #D2 = test1 ** 2
                                 D2[np.isnan(D2)] = 0
                                 missing_row, missing_col = np.where(np.isnan(X_sl))
                                 sqdist = D2.sum(axis=2)
-                                ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
-                                means = np.mean(statistics[ind], axis=1)
+                                target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
+                                means = np.mean(statistics[target_index], axis=1)
                                 X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
                                                                        missing_col]
                                 X[index_sl] = X_sl

From b7ff8e180038c80feb0f12eaff7f41c33297f269 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Tue, 25 Aug 2015 11:50:18 -0400
Subject: [PATCH 15/17] clean up comment; remove sort by missing column; remove
 list for gen_batches

---
 sklearn/preprocessing/imputation.py | 84 +++++++++--------------------
 1 file changed, 26 insertions(+), 58 deletions(-)

diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 6fc7351c847d4..b9119ca3abbae 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -411,64 +411,32 @@ def transform(self, X):
                     mask = mask.transpose()
                     statistics = statistics.transpose()
 
-                batch_size = 200  # set batch size for block query
-                if True:
-                    missing_index = np.where(mask.any(axis=1))[0]
-                    # @jnothman 's method
-                    D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0],
-                                                 statistics.shape[1]]))
-                    # Preallocate output array for np.multiply(test1, test1, out=D2)
-                    for sl in list(gen_batches(len(missing_index), batch_size)):
-                        X_sl = X[missing_index[sl]]
-                        mask_sl = mask[missing_index[sl]]
-                        X_sl[mask_sl] = np.nan
-                        impute_dist = X_sl[:][:, np.newaxis, :] - statistics
-
-                        # For the last slice, the length may not be the same
-                        # as batch_size
-                        if impute_dist.shape != D2.shape:
-                            D2 = np.empty_like(impute_dist)
-                        np.multiply(impute_dist, impute_dist, out=D2)
-                        #D2 = test1 ** 2
-                        #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics)
-                        #  ** 2
-                        D2[np.isnan(D2)] = 0
-                        missing_row, missing_col = np.where(np.isnan(X_sl))
-                        sqdist = D2.sum(axis=2)
-                        target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
-                        means = np.mean(statistics[target_index], axis=1)
-                        X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
-                                                               missing_col]
-                        X[missing_index[sl]] = X_sl
-                else:
-                    # group by missing features and batch within group
-                    group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])),
-                                            return_inverse=True)[1]
-                    for group_number in range(max(group_index)+1):
-                        if group_number == 0:
-                            continue
-                        else:
-                            missing_index = np.where(group_index == group_number)[0]
-                            batch_slice = list(gen_batches(len(missing_index), batch_size))
-                            for sl in batch_slice:
-                                index_sl = missing_index[sl]
-                                X_sl = X[index_sl]
-                                mask_sl = mask[missing_index[sl]]
-                                X_sl[mask_sl] = np.nan
-                                impute_dist = X_sl[:][:, np.newaxis, :] - statistics
-                                D2 = np.empty_like(impute_dist)
-                                np.multiply(impute_dist, impute_dist, out=D2)
-                                #D2 = test1 ** 2
-                                D2[np.isnan(D2)] = 0
-                                missing_row, missing_col = np.where(np.isnan(X_sl))
-                                sqdist = D2.sum(axis=2)
-                                target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
-                                means = np.mean(statistics[target_index], axis=1)
-                                X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
-                                                                       missing_col]
-                                X[index_sl] = X_sl
-
-
+                batch_size = 1  # set batch size for block query
+                missing_index = np.where(mask.any(axis=1))[0]
+                D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0],
+                                             statistics.shape[1]]))
+
+                # D2 (above) is the reusable out= buffer for np.multiply
+                for sl in gen_batches(len(missing_index), batch_size):
+                    X_sl = X[missing_index[sl]]
+                    mask_sl = mask[missing_index[sl]]
+                    X_sl[mask_sl] = np.nan
+                    impute_dist = X_sl[:][:, np.newaxis, :] - statistics
+
+                    # For the last slice, the length may not be the same
+                    # as batch_size
+                    if impute_dist.shape != D2.shape:
+                        D2 = np.empty_like(impute_dist)
+
+                    np.multiply(impute_dist, impute_dist, out=D2)
+                    D2[np.isnan(D2)] = 0
+                    missing_row, missing_col = np.where(np.isnan(X_sl))
+                    sqdist = D2.sum(axis=2)
+                    target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
+                    means = np.mean(statistics[target_index], axis=1)
+                    X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
+                                                           missing_col]
+                    X[missing_index[sl]] = X_sl
 
                 if self.axis == 1:
                     X = X.transpose()

From 3776dec99c5910f99f7c2e11fb611270c5c0bee7 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Wed, 26 Aug 2015 11:23:14 -0400
Subject: [PATCH 16/17] modify documentation

---
 doc/modules/preprocessing.rst                 | 11 +++---
 sklearn/preprocessing/imputation.py           | 18 +++++----
 .../preprocessing/tests/test_imputation.py    | 37 ++++++++++---------
 3 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 5b8ef636f874b..bbe65ec15f9ca 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -388,10 +388,10 @@ values. However, this comes at the price of losing data which may be valuable
 i.e., to infer them from the known part of the data.
 
 The :class:`Imputer` class provides basic strategies for imputing missing
-values, either using the mean, the median, the most frequent value of
-the row or column in which the missing values are located or the mean of the
-k-nearest neighbors computed using samples without missing values. This class also
-allows for different missing values encodings.
+values. It can use the mean, the median, or the most frequent value of
+the row or column in which the missing values are located. Alternatively, it can
+fill with the mean of the k-nearest neighbors, computed using only samples without
+missing values. The placeholder for missing values is configurable.
 
 The following snippet demonstrates how to replace missing values,
 encoded as ``np.nan``, using the mean value of the columns (axis 0)
@@ -425,7 +425,8 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored
 in the matrix. This format is thus suitable when there are many more missing
 values than observed values.
 
-Also, knn imputation strategy will use samples with full features, and if all samples
+When using ``strategy="knn"``, only samples without any missing features are
+used as neighbors for imputation. If all samples
 have missing features, this strategy will fail.
 
 :class:`Imputer` can be used in a Pipeline as a way to build a composite
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index b9119ca3abbae..ab0a3d1602058 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -83,13 +83,14 @@ class Imputer(BaseEstimator, TransformerMixin):
         - If "most_frequent", then replace missing using the most frequent
           value along the axis.
         - If "knn", then replace missing using the mean of the k-nearest
-          neighbors along the axis.
+          neighbors along the axis. Only samples with no missing values are
+          considered as neighbors.
 
     axis : integer, optional (default=0)
         The axis along which to impute.
 
-        - If `axis=0`, then impute along columns.
-        - If `axis=1`, then impute along rows.
+        - If ``axis=0``, then impute along columns.
+        - If ``axis=1``, then impute along rows.
 
     verbose : integer, optional (default=0)
         Controls the verbosity of the imputer.
@@ -101,18 +102,18 @@ class Imputer(BaseEstimator, TransformerMixin):
 
         - If X is not an array of floating values;
         - If X is sparse and `missing_values=0`;
-        - If `axis=0` and X is encoded as a CSR matrix;
-        - If `axis=1` and X is encoded as a CSC matrix.
+        - If ``axis=0`` and X is encoded as a CSR matrix;
+        - If ``axis=1`` and X is encoded as a CSC matrix.
 
     n_neighbors : int, optional (default=1)
-        It only has effect if the `strategy=knn`. It controls the number of
-        nearest neighbors used to compute the mean along the axis.
+        Controls the number of nearest neighbors used to compute the mean
+        along the axis. Only used when ``strategy="knn"``.
 
     Attributes
     ----------
     statistics_ : array of shape (n_features,)
         The imputation fill value for each feature if axis == 0.
-        If `strategy=knn`, then it contains those samples having no missing value.
+        If ``strategy="knn"``, it contains the samples that have no missing values.
 
     Notes
     -----
@@ -412,6 +413,7 @@ def transform(self, X):
                     statistics = statistics.transpose()
 
                 batch_size = 1  # set batch size for block query
+
                 missing_index = np.where(mask.any(axis=1))[0]
                 D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0],
                                              statistics.shape[1]]))
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
index e907e20d695bc..0db13ff950390 100644
--- a/sklearn/preprocessing/tests/test_imputation.py
+++ b/sklearn/preprocessing/tests/test_imputation.py
@@ -356,6 +356,13 @@ def test_imputation_knn():
         [-1,  2,  3,  7],
     ])
 
+    X_true_1 = np.array([
+        [-1, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
     X2 = np.array([
         [np.nan, -1,  0,  np.nan],
         [0,  2, -1,  3],
@@ -363,6 +370,13 @@ def test_imputation_knn():
         [-1,  2,  3,  7],
     ])
 
+    X_true_2 = np.array([
+        [-0.5, -1,  0,  5],
+        [0,  2, -1,  3],
+        [-1,  -1,  0, 5],
+        [-1,  2,  3,  7],
+    ])
+
     X3 = np.array([
         [np.nan, -1,  0,  5],
         [0,  np.nan, -1,  3],
@@ -377,34 +391,20 @@ def test_imputation_knn():
         [0,  2,  -1,  6],
     ])
 
-    X5 = np.array([
-        [999, -1,  0,  5],
-        [0,  2, -1,  3],
-        [-1,  -1,  0, 5],
-        [-1,  2,  3,  7],
-    ])
-
-    X_true_1 = np.array([
+    X_true_4 = np.array([
         [-1, -1,  0,  5],
         [0,  2, -1,  3],
         [-1,  -1,  0, 5],
-        [-1,  2,  3,  7],
+        [0,  2, -1,  6],
     ])
 
-    X_true_2 = np.array([
-        [-0.5, -1,  0,  5],
+    X5 = np.array([
+        [999, -1,  0,  5],
         [0,  2, -1,  3],
         [-1,  -1,  0, 5],
         [-1,  2,  3,  7],
     ])
 
-    X_true_4 = np.array([
-        [-1, -1,  0,  5],
-        [0,  2, -1,  3],
-        [-1,  -1,  0, 5],
-        [0,  2, -1,  6],
-    ])
-
     imputer = Imputer(missing_values='NaN', strategy="knn",
                       axis=0, n_neighbors=1)
     X_impute = imputer.fit(X).transform(X)
@@ -435,6 +435,7 @@ def test_imputation_knn():
     X5 = X5.astype(float)
     X_impute = imputer.fit(X5).transform(X5)
     assert_array_equal(X_true_1, X5)
+    assert_array_equal(X_impute, X5)
 
     imputer = Imputer(missing_values='NaN', strategy="knn", axis=0)
     msg = "There is no sample with complete data."

From ef290f3d4fc693eefc1db36fe3bcbc19c65f7815 Mon Sep 17 00:00:00 2001
From: Tian Wang <tw991@nyu.edu>
Date: Wed, 26 Aug 2015 11:25:10 -0400
Subject: [PATCH 17/17] modify documentation

---
 examples/missing_values.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/missing_values.py b/examples/missing_values.py
index 57411b5d2de5a..1c208887d94ee 100644
--- a/examples/missing_values.py
+++ b/examples/missing_values.py
@@ -45,8 +45,8 @@
 th = 0.14
 mask = missing_matrix < th
 missing_samples = mask.any(axis=1)
-full_percentage = (n_samples - missing_samples.sum())/float(n_samples)
-print("Percentage of samples with full features: %f" %full_percentage )
+full_percentage = (n_samples - missing_samples.sum()) / float(n_samples)
+print("Percentage of samples with full features: %f" % full_percentage)
 
 # Estimate the score on the entire dataset, with no missing values