
[MRG] Added k-Nearest Neighbor imputation for missing data #9212


Closed
wants to merge 106 commits into from
Commits (106)
287eb1e
Added k-Nearest Neighbor imputation of missing data
ashimb9 Jun 24, 2017
cd6d3a2
Fixed issue with passing separate matrices in fit() and transform()
ashimb9 Jun 25, 2017
3fc9596
Retrieved fitted data with self.statistics_ rather than passing it as…
ashimb9 Jun 25, 2017
d707dcd
Modified metrics to enable euclidean distance calculation with missin…
ashimb9 Jul 13, 2017
b4b5ae9
Changes to ensure Python 2.x compatibility
ashimb9 Jul 18, 2017
04ed4a0
Fixed pep8 issues
ashimb9 Jul 18, 2017
a6d8ef6
Addressed comments from review
ashimb9 Jul 19, 2017
e4f8612
Docstring example issues
ashimb9 Jul 19, 2017
daf247f
Formatting fixes on docstring
ashimb9 Jul 19, 2017
10f5adb
And yet more fixes
ashimb9 Jul 19, 2017
22cf9ef
Addressed review comments (Part 2)
ashimb9 Jul 23, 2017
2482c8a
Changed nan-mask from int8 to int32
ashimb9 Jul 23, 2017
66527cd
Addressed review comments (#3)
ashimb9 Jul 24, 2017
a968b1e
Pep8 fix
ashimb9 Jul 24, 2017
356c8e8
Comment edit on test_pairwise
ashimb9 Jul 24, 2017
d6aeaf3
Addressed review comments #4
ashimb9 Jul 25, 2017
e8ccdee
replaced or with in
ashimb9 Jul 25, 2017
4a8309b
Changed allow_nans assignment
ashimb9 Jul 25, 2017
5cbc156
One more or to in
ashimb9 Jul 25, 2017
a31c43a
Addressed review comments #5
ashimb9 Jul 31, 2017
eacb19d
Edited comments
ashimb9 Jul 31, 2017
d4049e2
Merge branch 'naneuclid' into knnimpute
ashimb9 Jul 31, 2017
cfb7c97
KNN Imputation with masked_euclidean and sklearn.neighbors
ashimb9 Aug 3, 2017
aa8547a
fixed array base check
ashimb9 Aug 3, 2017
009efa9
Fix column mean to nanmean
ashimb9 Aug 3, 2017
70f294a
Added weight support and cleaned the code
ashimb9 Aug 6, 2017
a54c162
Added inf check
ashimb9 Aug 6, 2017
c412e3b
Changed error message
ashimb9 Aug 6, 2017
ffe6774
Added test suite and example. Expanded docstring description
ashimb9 Aug 8, 2017
c2d6a6c
Changes to preprocessing __init__
ashimb9 Aug 8, 2017
9a19677
Added KNNImputer exception for NaN and inf in estimator_checks
ashimb9 Aug 8, 2017
a6a0a2f
Moved _check_weights() to fit()
ashimb9 Aug 9, 2017
4fbbe40
Addressed review comments - 1
ashimb9 Aug 18, 2017
29bdccb
Make NearestNeighbor import local to fit
ashimb9 Aug 18, 2017
6bb5471
Updated doc/modules/preprocessing.rst
ashimb9 Aug 18, 2017
e393cb0
More circular import fixes
ashimb9 Aug 18, 2017
6e5ec30
pep8 fixes
ashimb9 Aug 18, 2017
dd027f9
Minor comment updates
ashimb9 Aug 18, 2017
f33bff4
Addressed review comments (part 2)
ashimb9 Aug 20, 2017
2e1ea48
Fixed pyflakes issues
ashimb9 Aug 20, 2017
1098499
Added test for callable weights and updated comments.
ashimb9 Sep 3, 2017
a698120
Pep8 fixes
ashimb9 Sep 3, 2017
95e0f56
Comment, doc, and pep8 fixes
ashimb9 Sep 15, 2017
215c8c9
Docstring changes
ashimb9 Sep 15, 2017
fab313b
Changes to unit tests as per review comments
ashimb9 Sep 15, 2017
b2d5640
Tests moved to test_imputation
ashimb9 Sep 15, 2017
cd90614
Addressed review comments
ashimb9 Sep 19, 2017
2c9993a
test changes
ashimb9 Sep 19, 2017
473b191
Test changes part 2
ashimb9 Sep 19, 2017
de587b3
Fixed weight matrix shape issue
ashimb9 Sep 21, 2017
3d58616
Minor changes
ashimb9 Sep 21, 2017
5873d17
Fixed degenerate donor issue. Added tests
ashimb9 Sep 22, 2017
fd11002
Further test updates
ashimb9 Sep 22, 2017
2f41aa2
minor test fix
ashimb9 Sep 23, 2017
135056c
more minor changes
ashimb9 Sep 24, 2017
8c7190e
Moved weight_matrix inside if-weighted block
ashimb9 Sep 24, 2017
9616c2b
Addressed Review Comments
ashimb9 Dec 12, 2017
7e8f900
Fixed plot_missing example
ashimb9 Dec 12, 2017
df9dba7
Fixed Error Msg
ashimb9 Dec 12, 2017
d26724a
Modified missing check for sparse matrix
ashimb9 Dec 12, 2017
2b327da
Test update
ashimb9 Dec 12, 2017
1704672
Fixed nan check on sparse
ashimb9 Dec 17, 2017
a1cc41d
Review Comments Addressed (partial)
ashimb9 Dec 17, 2017
1417f3e
Fix merge conflict
ashimb9 Dec 19, 2017
34f68a5
Updated doc module
ashimb9 Dec 19, 2017
508270c
Added support for using only neighbors with non-missing features
ashimb9 Jan 26, 2018
0562054
Test update
ashimb9 Jan 26, 2018
24943ec
Import Numpy code for np.unique for older versions
ashimb9 Jan 26, 2018
a449c5b
Remove version check
ashimb9 Jan 26, 2018
a485db9
Minor fix
ashimb9 Jan 26, 2018
6058548
Added strategy to only use neighbors with non-nan value
ashimb9 Mar 28, 2018
1abbce8
Sync with upstream and merge with master
ashimb9 Mar 31, 2018
0b67233
Edit import path in test file
ashimb9 Mar 31, 2018
3e08209
Error fixes with imports and examples
ashimb9 Mar 31, 2018
851ab3c
Added use_complete docstring
ashimb9 Mar 31, 2018
7a0647f
Changed comments and fixed docstring
ashimb9 Mar 31, 2018
b17906f
Added more doctest fix and min neighbor check
ashimb9 Mar 31, 2018
bd6eb69
fix docs
ashimb9 Mar 31, 2018
2ea131b
Increase col_max_missing threshold for example plot
ashimb9 Mar 31, 2018
b1d9397
Lower missing rate in demo since tests are failing
ashimb9 Mar 31, 2018
d7cbdfb
Remove redundant check and changes in plot
ashimb9 Mar 31, 2018
1c9d858
Handling insufficient neighbors scenario
ashimb9 Mar 31, 2018
01722f1
Removed k actual neighbors algo
ashimb9 Apr 7, 2018
36d1d72
Addressed Comments
ashimb9 Apr 22, 2018
95f15ff
Merge branch 'master' into knnimpute
ashimb9 Apr 22, 2018
8e82d0d
Sync with upstream and merge
ashimb9 Apr 22, 2018
f463b15
Sync and merge
ashimb9 Apr 22, 2018
8a16e28
Minor bug fixes
ashimb9 Apr 28, 2018
a93827c
Removing flotsam
ashimb9 Apr 28, 2018
5de5b60
Minor bug fixes
ashimb9 Apr 29, 2018
eddf18f
Merge to upstream
ashimb9 May 26, 2018
2058186
Revert changes to sklearn/neighbors
jnothman Sep 30, 2018
69f2b7f
Merge branch 'master' into knnimpute
jnothman Sep 30, 2018
202cd37
Revert changes to deprecated file
jnothman Sep 30, 2018
6414081
COSMIT _MASKED_METRICS -> _NAN_METRICS
jnothman Sep 30, 2018
2825fcc
'NaN' no longer stands for NaN
jnothman Sep 30, 2018
745fa2d
Fix missing_values validation
jnothman Oct 3, 2018
44f0210
Attempt to reinstate neighbors changes
jnothman Oct 3, 2018
82d5d20
Fix up test failures
jnothman Oct 3, 2018
d8b23e6
Fix flake8 issues in example
jnothman Oct 3, 2018
c682361
Default force_all_finite to True rather than False
jnothman Oct 4, 2018
1912611
Fix example usage
jnothman Oct 4, 2018
607ff7f
Fix masked_euclidean testing in nearest neighbors
jnothman Oct 4, 2018
87677e7
Fix missing_values in masked_euclidean_distances
jnothman Oct 4, 2018
39e1da8
Can't subtract list and set in Py2
jnothman Oct 4, 2018
e1afa12
Merge branch 'master' into knnimpute
jnothman Jan 17, 2019
50 changes: 50 additions & 0 deletions doc/modules/impute.rst
@@ -16,6 +16,14 @@
values. However, this comes at the price of losing data which may be valuable
i.e., to infer them from the known part of the data. See the :ref:`glossary`
entry on imputation.

Imputer transformers can be used in a Pipeline as a way to build a composite
estimator that supports imputation. See
:ref:`sphx_glr_auto_examples_plot_missing_values.py`.


Simple univariate imputation
============================

The :class:`SimpleImputer` class provides basic strategies for imputing missing
values. Missing values can be imputed with a provided constant value, or using
the statistics (mean, median or most frequent) of each column in which the
@@ -75,6 +83,48 @@ string values or pandas categoricals when using the ``'most_frequent'`` or
:class:`SimpleImputer` can be used in a Pipeline as a way to build a composite
estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
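As a sketch of that pipeline pattern (illustrative only — the downstream ``LinearRegression`` estimator and the toy data are arbitrary choices, not from the PR):

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Column means are learned in fit() and reused on later transform() calls,
# so imputation statistics come only from the training data seen by fit().
pipe = make_pipeline(SimpleImputer(strategy="mean"), LinearRegression())
X = np.array([[1.0, 2.0], [np.nan, 4.0], [7.0, np.nan], [3.0, 5.0]])
y = np.array([1.0, 2.0, 3.0, 4.0])
pipe.fit(X, y)
print(pipe.predict(X).shape)  # (4,)
```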

.. _knnimpute:

Nearest neighbors imputation
============================

The :class:`KNNImputer` class provides imputation for completing missing
values using the k-Nearest Neighbors approach. Each sample's missing values
are imputed using values from ``n_neighbors`` nearest neighbors found in the
training set. Note that if a sample has more than one feature missing, then
Review comment (Member): This sentence is a bit cryptic to me. Maybe say "If multiple features are missing, then the sets of neighbors used for imputation can be different" or something like that?

the sets of ``n_neighbors`` donors used for imputation can differ across
the features being imputed.
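Neighbors here are found with a missingness-aware distance. A minimal sketch of such a metric (a hypothetical helper, assuming the ``n_features / n_observed`` rescaling used by masked/NaN-euclidean metrics):

```python
import numpy as np

def masked_euclidean(a, b):
    # Euclidean distance over coordinates observed in BOTH samples,
    # rescaled by n_features / n_observed so pairs with different
    # amounts of missingness remain comparable.
    valid = ~np.isnan(a) & ~np.isnan(b)
    sq = np.sum((a[valid] - b[valid]) ** 2)
    return np.sqrt(sq * a.shape[0] / valid.sum())

a = np.array([1.0, 2.0, np.nan])
b = np.array([3.0, 4.0, 3.0])
print(masked_euclidean(a, b))  # sqrt((4 + 4) * 3/2) ≈ 3.4641
```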

Each missing feature is then imputed with the average, either weighted or
unweighted, of the values that these neighbors (the *donors*) have for that
feature. When fewer than ``n_neighbors`` donors are available, the training
set average for that feature is used
Review comment (Member): I find this sentence a bit cryptic. Maybe define donors somewhere?

for imputation. The number of donors available for a given feature can be
smaller than ``n_neighbors``: it depends on the overall sample size and on
how many samples are excluded from the nearest neighbor calculation for
having too many missing features (as controlled by ``row_max_missing``).
For more information on the methodology, see ref. [#]_.

The following snippet demonstrates how to replace missing values,
encoded as ``np.nan``, using the mean feature value of the two nearest
neighbors of the rows that contain the missing values::

    >>> import numpy as np
    >>> from sklearn.impute import KNNImputer
    >>> nan = np.nan
    >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2, weights="uniform")
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])
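For illustration, the result above can be reproduced with a plain NumPy sketch of the procedure (a simplified, hypothetical re-implementation — the actual estimator relies on the masked euclidean metric and ``sklearn.neighbors``):

```python
import numpy as np

def knn_impute_sketch(X, n_neighbors=2):
    """Uniform-weight kNN imputation (simplified sketch, not the estimator)."""
    X = np.asarray(X, dtype=float)
    out = X.copy()
    mask = np.isnan(X)
    n_samples, n_features = X.shape
    for i in range(n_samples):
        if not mask[i].any():
            continue
        # NaN-aware euclidean distance to every other row, rescaled by
        # n_features / n_observed so partial overlaps stay comparable
        dists = np.full(n_samples, np.inf)
        for j in range(n_samples):
            if j == i:
                continue
            valid = ~mask[i] & ~mask[j]
            if valid.any():
                sq = np.sum((X[i, valid] - X[j, valid]) ** 2)
                dists[j] = np.sqrt(sq * n_features / valid.sum())
        order = np.argsort(dists)
        for col in np.where(mask[i])[0]:
            # Donors: nearest rows that actually observe this feature,
            # so the donor set may differ per imputed feature.
            donors = [j for j in order if not mask[j, col]][:n_neighbors]
            out[i, col] = X[donors, col].mean()
    return out

X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
print(knn_impute_sketch(X))  # row 0 -> 4.0 and row 2 -> 5.5, as above
```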

.. [#] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value
estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001
Pages 520-525.

.. _missing_indicator:

Marking imputed values
1 change: 1 addition & 0 deletions doc/modules/preprocessing.rst
@@ -672,6 +672,7 @@ Imputation of missing values

Tools for imputing missing values are discussed at :ref:`impute`.


.. _polynomial_features:

Generating polynomial features
22 changes: 17 additions & 5 deletions examples/plot_missing_values.py
@@ -14,18 +14,20 @@
The median is a more robust estimator for data with high magnitude variables
which could dominate results (otherwise known as a 'long tail').

With ``KNNImputer``, missing values can be imputed using the weighted
or unweighted mean of the desired number of nearest neighbors.

In addition to using an imputing method, we can also keep track of which
values were missing using :func:`sklearn.impute.MissingIndicator`, since
missingness itself might carry some information.
"""
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_diabetes
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
@@ -72,10 +74,18 @@ def get_results(dataset):
                                        scoring='neg_mean_squared_error',
                                        cv=5)

    # Estimate the score after kNN-imputation of the missing values
    knn_estimator = make_pipeline(
        KNNImputer(missing_values=0, col_max_missing=0.99),
        RandomForestRegressor(random_state=0, n_estimators=100))
    knn_impute_scores = cross_val_score(knn_estimator, X_missing, y_missing,
                                        scoring='neg_mean_squared_error',
                                        cv=5)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (knn_impute_scores.mean(), knn_impute_scores.std()),
            )


results_diabetes = np.array(get_results(load_diabetes()))
@@ -91,8 +101,10 @@ def get_results(dataset):

x_labels = ['Full data',
            'Zero imputation',
            'Mean Imputation',
            'KNN Imputation',
            ]
colors = ['r', 'g', 'b', 'orange', 'black']

# plot diabetes results
plt.figure(figsize=(12, 6))