diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 0ca707ce2cbbf..120573c1d09c5 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -48,6 +48,11 @@ Model evaluation and meta-estimators
 - A scorer based on :func:`metrics.brier_score_loss` is also available.
   :issue:`9521` by :user:`Hanmin Qin <qinhanmin2014>`.
 
+- The ``axis`` parameter in
+  :class:`preprocessing.Imputer <preprocessing.Imputer>` is deprecated. Its
+  removal is planned for the 0.22 release. :issue:`9672` by
+  :user:`Baze Petrushev <petrushev>`.
+
 Bug fixes
 .........
 
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 12d5425fbf604..881ccc31aed97 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -82,12 +82,16 @@ class Imputer(BaseEstimator, TransformerMixin):
         - If "most_frequent", then replace missing using the most frequent
           value along the axis.
 
-    axis : integer, optional (default=0)
+    axis : integer, optional (default=None)
         The axis along which to impute.
 
         - If `axis=0`, then impute along columns.
         - If `axis=1`, then impute along rows.
 
+        .. deprecated:: 0.20
+           The ``axis`` parameter will be removed in 0.22, after which
+           ``Imputer`` will only impute along columns (i.e., ``axis=0``).
+
     verbose : integer, optional (default=0)
         Controls the verbosity of the imputer.
 
@@ -115,7 +119,7 @@ class Imputer(BaseEstimator, TransformerMixin):
       contain missing values).
     """
     def __init__(self, missing_values="NaN", strategy="mean",
-                 axis=0, verbose=0, copy=True):
+                 axis=None, verbose=0, copy=True):
         self.missing_values = missing_values
         self.strategy = strategy
         self.axis = axis
@@ -143,14 +147,22 @@ def fit(self, X, y=None):
                              " got strategy={1}".format(allowed_strategies,
                                                         self.strategy))
 
-        if self.axis not in [0, 1]:
+        if self.axis is None:
+            self._axis = 0
+        else:
+            warnings.warn("'axis' will be removed from Imputer, and it will "
+                          "only impute along columns (axis=0) in 0.22",
+                          DeprecationWarning)
+            self._axis = self.axis
+
+        if self._axis not in [0, 1]:
             raise ValueError("Can only impute missing values on axis 0 and 1, "
-                             " got axis={0}".format(self.axis))
+                             " got axis={0}".format(self._axis))
 
         # Since two different arrays can be provided in fit(X) and
         # transform(X), the imputation data will be computed in transform()
         # when the imputation is done per sample (i.e., when axis=1).
-        if self.axis == 0:
+        if self._axis == 0:
             X = check_array(X, accept_sparse='csc', dtype=np.float64,
                             force_all_finite=False)
 
@@ -158,12 +170,12 @@ def fit(self, X, y=None):
                 self.statistics_ = self._sparse_fit(X,
                                                     self.strategy,
                                                     self.missing_values,
-                                                    self.axis)
+                                                    self._axis)
             else:
                 self.statistics_ = self._dense_fit(X,
                                                    self.strategy,
                                                    self.missing_values,
-                                                   self.axis)
+                                                   self._axis)
 
         return self
 
@@ -306,7 +318,7 @@ def transform(self, X):
         X : {array-like, sparse matrix}, shape = [n_samples, n_features]
             The input data to complete.
         """
-        if self.axis == 0:
+        if self._axis == 0:
             check_is_fitted(self, 'statistics_')
             X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
                             force_all_finite=False, copy=self.copy)
@@ -326,27 +338,27 @@ def transform(self, X):
                 statistics = self._sparse_fit(X,
                                               self.strategy,
                                               self.missing_values,
-                                              self.axis)
+                                              self._axis)
 
             else:
                 statistics = self._dense_fit(X,
                                              self.strategy,
                                              self.missing_values,
-                                             self.axis)
+                                             self._axis)
 
         # Delete the invalid rows/columns
         invalid_mask = np.isnan(statistics)
         valid_mask = np.logical_not(invalid_mask)
         valid_statistics = statistics[valid_mask]
         valid_statistics_indexes = np.where(valid_mask)[0]
-        missing = np.arange(X.shape[not self.axis])[invalid_mask]
+        missing = np.arange(X.shape[not self._axis])[invalid_mask]
 
-        if self.axis == 0 and invalid_mask.any():
+        if self._axis == 0 and invalid_mask.any():
             if self.verbose:
                 warnings.warn("Deleting features without "
                               "observed values: %s" % missing)
             X = X[:, valid_statistics_indexes]
-        elif self.axis == 1 and invalid_mask.any():
+        elif self._axis == 1 and invalid_mask.any():
             raise ValueError("Some rows only contain "
                              "missing values: %s" % missing)
 
@@ -363,10 +375,10 @@ def transform(self, X):
                 X = X.toarray()
 
             mask = _get_mask(X, self.missing_values)
-            n_missing = np.sum(mask, axis=self.axis)
+            n_missing = np.sum(mask, axis=self._axis)
             values = np.repeat(valid_statistics, n_missing)
 
-            if self.axis == 0:
+            if self._axis == 0:
                 coordinates = np.where(mask.transpose())[::-1]
             else:
                 coordinates = mask